diff --git a/configs/config_all.yaml b/configs/config_all.yaml index 49dacb6fb1..7dfd46c07f 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -513,7 +513,47 @@ process: mem_required: '9GB' - whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace. + + - video_human_tracks_extraction_mapper: # Get the body and face trajectory bounding box of people in one shot of the video. To ensure correctness, it should be applied after video_split_by_scene_mapper + face_track_bbox_path: your_path_to_save_bounding_box_data + YOLOv8_human_model_path: ./data_juicer/my_pretrained_method/YOLOv8_human/weights/best.pt + mem_required: '10GB' + + - video_active_speaker_mapper: # Based on the results of video_human_tracks_extraction_mapper, determine whether each person is an active speaker + tempt_save_path: ./HumanVBenchRecipe/dj_ASD_tempt # Used to store temporary videos + face_track_bbox_path: ./HumanVBenchRecipe/dj_human_track # Human track Data storage address in video_human_tracks_extraction_mapper + mem_required: '10GB' + + - video_audio_attribute_mapper: # If the audio is speech, classify the gender and age of the speech + hf_audio_mapper: 'pt_model/wav2vec2-large-robust-24-ft-age-gender' # Huggingface model name for speech age and gender classification + mem_required: '7GB' + + - video_captioning_from_human_tracks_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on the single person in the video for captioning + video_describe_model_path: pt_model/sharegpt4video-8b # model path to sharegpt4video-8b + tempt_video_path: data_juicer/HumanVBenchRecipe/dj_tmpt # Used to store temporary videos + mem_required: '35GB' + + - video_captioning_face_attribute_emotion_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on judging the gender, age, and race of a single person in the video + face_track_query: Please only describe the appearance and facial emotions of the person in the video in detail. Don't mention the background. Less than 80 words. + cropping_face_video_tempt_path: ./tempt_video/tmp_video_remove # Used to store temporary videos + video_describe_model_path: 'pt_model/VideoLLaMA2' # Huggingface model DAMO-NLP-SG/VideoLLaMA2-7B-16F + mem_required: '35GB' + + - video_audio_speech_ASR_mapper: # Automatic speech recognition from video speech + model_dir_ASR: 'pt_model/SenseVoiceSmall' # Huggingface model FunAudioLLM/SenseVoiceSmall + mem_required: '20GB' + + - video_audio_speech_emotion_mapper: # Speech emotion recognition from video speech + model_dir_emo: 'pt_model/SenseVoiceSmall' # # Huggingface model FunAudioLLM/SenseVoiceSmall + mem_required: '20GB' + + # Filter ops + - video_face_ratio_filter: # Filter to retain human-centric videos + threshold: 0.65 # The lower limit of the ratio of frames with faces to the total number of video frames + detect_interval: 4 + any_or_all: any + - alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range. tokenization: false # whether to count the ratio of alphanumeric to the total number of tokens. 
    min_ratio: 0.0  # the min ratio of filter range
diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py
index 8cb986b2b3..259284c747 100644
--- a/data_juicer/ops/filter/__init__.py
+++ b/data_juicer/ops/filter/__init__.py
@@ -43,6 +43,7 @@
 from .video_watermark_filter import VideoWatermarkFilter
 from .word_repetition_filter import WordRepetitionFilter
 from .words_num_filter import WordsNumFilter
+from .video_face_ratio_filter import VideoFaceRatioFilter

 __all__ = [
     'AlphanumericFilter', 'AudioDurationFilter', 'AudioNMFSNRFilter',
@@ -61,7 +62,7 @@
     'VideoMotionScoreFilter', 'VideoMotionScoreRaftFilter', 'VideoNSFWFilter',
     'VideoOcrAreaRatioFilter', 'VideoResolutionFilter',
     'VideoTaggingFromFramesFilter', 'VideoWatermarkFilter',
-    'WordRepetitionFilter', 'WordsNumFilter'
+    'WordRepetitionFilter', 'WordsNumFilter', 'VideoFaceRatioFilter'
 ]

 NON_STATS_FILTERS = [
diff --git a/data_juicer/ops/filter/video_face_ratio_filter.py b/data_juicer/ops/filter/video_face_ratio_filter.py
new file mode 100644
index 0000000000..2b41c92eb3
--- /dev/null
+++ b/data_juicer/ops/filter/video_face_ratio_filter.py
@@ -0,0 +1,142 @@
+import gc
+
+import av
+import numpy as np
+from jsonargparse.typing import ClosedUnitInterval
+
+from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields, StatsKeys
+from data_juicer.utils.mm_utils import pil_to_opencv
+
+from ..base_op import OPERATORS, Filter
+from ..op_fusion import LOADED_VIDEOS
+
+OP_NAME = 'video_face_ratio_filter'
+
+with AvailabilityChecking(['dlib', 'Pillow'], OP_NAME):
+    import dlib
+
+
+@OPERATORS.register_module(OP_NAME)
+@LOADED_VIDEOS.register_module(OP_NAME)
+class VideoFaceRatioFilter(Filter):
+    """Keep samples whose videos contain human faces in a sufficiently
+    large ratio of the sampled frames."""
+
+    def __init__(self,
+                 threshold: ClosedUnitInterval = 0.8,
+                 detect_interval: int = 1,
+                 any_or_all: str = 'all',
+                 *args,
+                 **kwargs):
+        """
+        Initialization method.
+
+        :param threshold: lower bound of the ratio of sampled frames that
+            contain at least one face. Videos below this ratio are dropped.
+        :param detect_interval: run face detection once every
+            `detect_interval` frames.
+        :param any_or_all: keep this sample with 'any' or 'all' strategy of
+            all videos. 'any': keep this sample if any videos meet the
+            condition. 'all': keep this sample only if all videos meet the
+            condition.
+        :param args: extra args
+        :param kwargs: extra args
+        """
+        super().__init__(*args, **kwargs)
+        self.threshold = threshold
+
+        if any_or_all not in ['any', 'all']:
+            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
+                             f'Can only be one of ["any", "all"].')
+        self.any = (any_or_all == 'any')
+
+        # initialize the face detector
+        self.detector = dlib.get_frontal_face_detector()
+
+        self.detect_interval = detect_interval
+
+    def compute_stats(self, sample, rank=None, context=False):
+        # check if it's computed already
+        if StatsKeys.video_face_exist in sample[Fields.stats]:
+            return sample
+
+        # load videos
+        loaded_video_keys = sample[self.video_key]
+        video_faces_ratio = {}
+
+        for video_key in loaded_video_keys:
+            try:
+                with av.open(video_key) as container:
+                    # get the video stream
+                    video_stream = next(s for s in container.streams
+                                        if s.type == 'video')
+                    # iterate over the video frames and detect faces
+                    frame_counter = 0
+                    frames_with_face = 0
+                    detect_num = 0
+                    for packet in container.demux(video_stream):
+                        try:
+                            for frame in packet.decode():
+                                frame_counter += 1
+                                if frame_counter % self.detect_interval == 0:
+                                    detect_num += 1
+                                    image = pil_to_opencv(frame.to_image())
+                                    faces = self.detector(image)
+                                    if len(faces) > 0:
+                                        frames_with_face += 1
+                        except Exception as e:
+                            print(f'Frame decoding error in video '
+                                  f'{video_key}: {e}')
+                            frames_with_face = 0
+                            detect_num = 0
+
+                    # proportion of sampled frames that contain a face
+                    if detect_num > 0:
+                        face_ratio = frames_with_face / detect_num
+                    else:
+                        face_ratio = 0.0
+                    video_faces_ratio[video_key] = face_ratio
+            except av.AVError as e:
+                print(f'Error opening video {video_key}: {e}')
+                video_faces_ratio[video_key] = 0.0
+
+        # store the face ratio of each video
+        sample[Fields.stats][StatsKeys.video_face_exist] = [
+            video_faces_ratio[video_key]
+            for video_key in sample[self.video_key]
+        ]
+
+        gc.collect()
+
+        return sample
+
+    def process(self, sample):
+        video_faces_ratio = sample[Fields.stats][StatsKeys.video_face_exist]
+        keep_bools = np.array([
+            ratio >= self.threshold for ratio in video_faces_ratio
+        ])
+        if len(keep_bools) <= 0:
+            return True
+
+        # different strategies
+        if self.any:
+            return keep_bools.any()
+        else:
+            return keep_bools.all()
diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
index 8ffe7cc8e8..026fd10149 100644
--- a/data_juicer/ops/mapper/__init__.py
+++ b/data_juicer/ops/mapper/__init__.py
@@ -73,6 +73,13 @@
 from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper
 from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
 from .whitespace_normalization_mapper import WhitespaceNormalizationMapper
+from .video_active_speaker_mapper import VideoActiveSpeakerMapper
+from .video_audio_attribute_mapper import VideoAudioAttributeMapper
+from .video_audio_speech_ASR_mapper import VideoAudioSpeechASRMapper
+from .video_audio_speech_emotion_mapper import VideoAudioSpeechEmotionMapper
+from .video_captioning_face_attribute_emotion_mapper import VideoCaptioningFaceAttributeEmotionMapper
+from .video_captioning_from_human_tracks_mapper import VideoCaptioningFromHumanTracksMapper
+from .video_human_tracks_extraction_mapper import VideoHumanTracksExtractionMapper

 __all__ = [
     'AudioFFmpegWrappedMapper', 'CalibrateQAMapper', 'CalibrateQueryMapper',
@@ -105,5 +112,8 @@
     'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper',
     'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper',
     'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper',
-    'WhitespaceNormalizationMapper'
+    'WhitespaceNormalizationMapper', 'VideoActiveSpeakerMapper',
+    'VideoAudioAttributeMapper', 'VideoAudioSpeechASRMapper',
+    'VideoAudioSpeechEmotionMapper', 'VideoCaptioningFaceAttributeEmotionMapper',
+    'VideoCaptioningFromHumanTracksMapper', 'VideoHumanTracksExtractionMapper'
 ]
diff --git a/data_juicer/ops/mapper/video_active_speaker_mapper.py b/data_juicer/ops/mapper/video_active_speaker_mapper.py
new file mode 100644
index 0000000000..9dfea009a5
--- /dev/null
+++ b/data_juicer/ops/mapper/video_active_speaker_mapper.py
@@ -0,0 +1,207 @@
+import gc
+import os
+
+from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields
+from data_juicer.utils.ASD_mapper_utils import (get_video_array_cv2,
+                                                evaluate_network,
+                                                crop_video_with_facetrack,
+                                                longest_continuous_actives)
+from data_juicer.utils.model_utils import get_model, prepare_model
+
+from ..base_op import OPERATORS, Mapper
+from ..op_fusion import LOADED_VIDEOS
+
+OP_NAME = 'video_active_speaker_mapper'
+
+with AvailabilityChecking([], OP_NAME):
+    import torch
+    import sys
+    sys.path.append('./data_juicer/my_pretrained_method/Light-ASD')
+    import tempfile
+    import shutil, pickle
+    from shutil import rmtree
+    import subprocess
+    import tqdm, glob
+
+
+@OPERATORS.register_module(OP_NAME)
+@LOADED_VIDEOS.register_module(OP_NAME)
+class VideoActiveSpeakerMapper(Mapper):
+    """Mapper to decide, for each human track extracted by
+    video_human_tracks_extraction_mapper, whether that person is an active
+    speaker, using the Light-ASD model plus consistency checks against the
+    speech age/gender attributes and the face-track attributes."""
+
+    _default_kwargs = {'upsample_num_times': 0}
+
+    def __init__(self,
+                 tempt_save_path: str = './HumanVBenchRecipe/dj_ASD_tempt',
+                 face_track_bbox_path: str = './HumanVBenchRecipe/dj_human_track',
+                 Light_ASD_model_path: str = 'weight/finetuning_TalkSet.model',
+                 acitve_threshold: int = 15,
+                 *args,
+                 **kwargs):
+        """
+        Initialization method.
+ + :param blur_type: + """ + super().__init__(*args, **kwargs) + self._accelerator = 'cuda' + self._init_parameters = self.remove_extra_parameters(locals()) + self.acitve_threshold = acitve_threshold + + self.tempt_save_path = tempt_save_path + self.face_track_bbox_path = face_track_bbox_path + + # Initialize ASD model + self.ASD_model_key = prepare_model(model_type='Light_ASD', + pretrained_model_name_or_path=Light_ASD_model_path) + + def active_speaker_detection_revise(self, active_score,is_child_descrip,speech_audio,face_gender): + speech_child = speech_audio['child'][0] + speech_male = speech_audio['male'][0] + speech_female = speech_audio['female'][0] + if speech_male > speech_female: + speech_gender = 'Man' + speech_gender_confidence = speech_male + else: + speech_gender = 'Woman' + speech_gender_confidence = speech_female + + if ' not ' in is_child_descrip: + is_child_apperance = False + else: + is_child_apperance = True + + if speech_child < 0.1: + is_child_voice = False + elif speech_audio['Age'][0]<=12: + is_child_voice = True + else: + is_child_voice = 'Not Sure' + + # Consistency detection: only perform false positive detection on positive samples + if active_score>self.acitve_threshold: + speak_active = True + # age consistency test: + if not is_child_voice == 'Not Sure': + if is_child_apperance == is_child_voice: + # gender consistency test + if speech_gender_confidence > 0.65 and float(face_gender[1]) > 0.65: + if not speech_gender == face_gender[0]: + speak_active = False + else: + speak_active = False + return speak_active + else: + return False + + + def process(self, sample, rank=None): + # there is no video in this sample + if self.video_key not in sample or not sample[self.video_key]: + sample[Fields.source_file] = [] + return sample + + if not Fields.video_audio_tags in sample: + raise ValueError("video_active_speaker_mapper must be operated after video_tagging_from_audio_mapper.") + + if not Fields.human_track_data_path in sample: + raise ValueError("video_active_speaker_mapper must be operated after video_human_tracks_extraction_mapper.") + + if not Fields.audio_speech_attribute in sample: + raise ValueError("video_active_speaker_mapper must be operated after audio_speech_attribute.") + + if not Fields.video_facetrack_attribute_demographic in sample: + raise ValueError("video_active_speaker_mapper must be operated after video_facetrack_attribute_demographic.") + + if not Fields.video_facetrack_is_child in sample: + raise ValueError("video_active_speaker_mapper must be operated after video_captioning_from_human_tracks_mapper.") + + loaded_video_keys = sample[self.video_key] + audio_speech_attribute = sample[Fields.audio_speech_attribute] + face_demographic = sample[Fields.video_facetrack_attribute_demographic][0] + child_flag = sample[Fields.video_facetrack_is_child] + + Total_result = [] + + temp_dir = tempfile.mkdtemp(dir=self.tempt_save_path) + pyaviPath = os.path.join(temp_dir, 'pyavi') + pyframesPath = os.path.join(temp_dir, 'pyframes') + pyworkPath = os.path.join(temp_dir, 'pywork') + pycropPath = os.path.join(temp_dir, 'pycrop') + if os.path.exists(temp_dir): + rmtree(temp_dir) + + audio_tag = sample[Fields.video_audio_tags] + asd_detection_model = get_model(self.ASD_model_key, rank=rank) + + for id_out,video_key in enumerate(loaded_video_keys): + os.makedirs(pyaviPath, exist_ok = False) # The path for the input video, input audio, output video + os.makedirs(pyframesPath, exist_ok = False) # Save all the video frames + os.makedirs(pyworkPath, exist_ok = 
False) # Save the results in this process by the pckl method + os.makedirs(pycropPath, exist_ok = False) # Save the detected face clips (audio+video) in this process + + # Extract audio + audio_is_empty = False + audioFilePath = os.path.join(pyaviPath, 'audio.wav') + command = ("ffmpeg -y -i '%s' -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic" % \ + (video_key, 10, audioFilePath)) + if audio_tag[id_out] == "EMPTY": + audio_is_empty = True + else: + subprocess.call(command, shell=True, stdout=None) + + + video_array = get_video_array_cv2(video_key) + + def load_pkl(file_path): + with open(file_path, 'rb') as file: + return pickle.load(file) + # get allTracks + allTracks = [load_pkl(item['bbox_path']) for item in sample[Fields.human_track_data_path][id_out]] + + # Face clips cropping + for ii, track in tqdm.tqdm(enumerate(allTracks), total = len(allTracks)): + result = crop_video_with_facetrack(video_array, track, os.path.join(pycropPath, '%05d' % ii), audioFilePath, audio_is_empty) + if not result: + raise ValueError("something wrong with crop_video_with_facetrack.") + + # Active Speaker Detection + if audio_tag[id_out] == 'Speech': + files = glob.glob("%s/*.avi"%pycropPath) + files.sort() + try: + scores = evaluate_network(files, asd_detection_model, pycropPath) + except: + scores = [[-10000]]* len(allTracks) + + else: + scores = [[-10000]]* len(allTracks) + + for id in range(len(scores)): + allTracks[id]['active_scores'] = scores[id] + + update_track = allTracks + # for validation + # visualization(vidTracks, scores, video_array, pyaviPath) + + shutil.rmtree(temp_dir) + + speak_flag_for_tracks_in_a_video = [] + for track_idx,track_i in enumerate(update_track): + active_count = longest_continuous_actives(track_i['active_scores']) + audio_attri = audio_speech_attribute[id_out][0] + is_child_descrip = child_flag[id_out][track_idx][0] + face_gender = face_demographic[id_out][track_idx]['gender'] + flag = self.active_speaker_detection_revise(active_count, is_child_descrip, audio_attri, face_gender) + speak_flag_for_tracks_in_a_video.append(flag) + + + Total_result.append(speak_flag_for_tracks_in_a_video) + torch.cuda.empty_cache() + + sample[Fields.ASD_revise_flag] = Total_result + + gc.collect() + torch.cuda.empty_cache() + + return sample diff --git a/data_juicer/ops/mapper/video_audio_attribute_mapper.py b/data_juicer/ops/mapper/video_audio_attribute_mapper.py new file mode 100644 index 0000000000..f84cfa2e8d --- /dev/null +++ b/data_juicer/ops/mapper/video_audio_attribute_mapper.py @@ -0,0 +1,96 @@ +import librosa +from data_juicer.utils.constant import Fields +from data_juicer.utils.availability_utils import AvailabilityChecking +from data_juicer.utils.mm_utils import extract_audio_from_video +from data_juicer.my_pretrained_method.audio_code.wav2vec_age_gender import process_func,AgeGenderModel +from ..base_op import OPERATORS, Mapper +from data_juicer.utils.model_utils import get_model, prepare_model + +NAME = 'video_audio_attribute_mapper' +CHECK_PKGS = [ + 'transformers', 'transformers_stream_generator', 'einops', 'accelerate', + 'tiktoken' +] + +with AvailabilityChecking(CHECK_PKGS, NAME): + from data_juicer.utils.model_utils import get_model, prepare_model + + + +@OPERATORS.register_module(NAME) +class VideoAudioAttributeMapper(Mapper): + """Mapper to caption a video according to its audio streams based on + Qwen-Audio model. + """ + + def __init__(self, + hf_audio_mapper: str = None, + *args, **kwargs): + """ + Initialization method. 
+ + :param keep_original_sample: whether to keep the original sample. If + it's set to False, there will be only captioned sample in the + final datasets and the original sample will be removed. It's True + in default. + :param args: extra args + :param kwargs: extra args + """ + super().__init__(*args, **kwargs) + self._accelerator = 'cuda' + self._model_sampling_rate = 16000 + + self._hf_summarizer = hf_audio_mapper if hf_audio_mapper else 'audeering/wav2vec2-large-robust-24-ft-age-gender' # noqa: E501 + self.model_key = prepare_model( + model_type='huggingface', + pretrained_model_name_or_path=self._hf_summarizer, + ) + + + + + def process(self, sample, rank=None): + # there is no video in this sample + if self.video_key not in sample or not sample[self.video_key]: + return [] + + # get paths of all video(s) + loaded_video_keys = sample[self.video_key] + audio_tag = sample['__dj__video_audio_tags__'] + + Total_result = [] + # get models + model, processor = get_model(self.model_key, rank=rank) + + for i,video in enumerate(loaded_video_keys): + audio_tag_this = audio_tag[i] + if not audio_tag_this == 'Speech': + Total_result.append([]) + else: + ys, srs, valid_indexes = extract_audio_from_video( + video, stream_indexes=[0]) + if len(valid_indexes) == 0: + # there is no valid audio streams. Skip! + Total_result.append([]) + continue + + # inference + y = ys[0] + sr = srs[0] + # check if it meets the sampling rate condition of the model + if sr != self._model_sampling_rate: + y = librosa.resample(y, + orig_sr=sr, + target_sr=self._model_sampling_rate) + sr = self._model_sampling_rate + + Age_female_male_child = process_func(y, sr, processor, model, device=model.device)[0] + Age_female_male_child_dict = {} + Age_female_male_child_dict['Age'] = [int(Age_female_male_child[0]*100)] + Age_female_male_child_dict['female'] = [Age_female_male_child[1]] + Age_female_male_child_dict['male'] = [Age_female_male_child[2]] + Age_female_male_child_dict['child'] = [Age_female_male_child[3]] + Total_result.append([Age_female_male_child_dict]) + + sample[Fields.audio_speech_attribute] = Total_result + return sample diff --git a/data_juicer/ops/mapper/video_audio_speech_ASR_mapper.py b/data_juicer/ops/mapper/video_audio_speech_ASR_mapper.py new file mode 100644 index 0000000000..38359fa137 --- /dev/null +++ b/data_juicer/ops/mapper/video_audio_speech_ASR_mapper.py @@ -0,0 +1,99 @@ +import librosa + +from data_juicer.utils.availability_utils import AvailabilityChecking +from data_juicer.utils.constant import Fields +from data_juicer.utils.mm_utils import extract_audio_from_video +from data_juicer.utils.model_utils import get_model, prepare_model + +from ..base_op import OPERATORS, Mapper +import gc + + +OP_NAME = 'video_audio_speech_ASR_mapper' + +with AvailabilityChecking(['torch', 'transformers', 'torchaudio'], OP_NAME): + import torch + torch.set_num_threads(1) + + +@OPERATORS.register_module(OP_NAME) +class VideoAudioSpeechASRMapper(Mapper): + """Mapper to generate video tags from audio streams extracted by video + using the Audio Spectrogram Transformer. + """ + + def __init__(self, + model_dir_ASR='/mnt1/daoyuan_mm/SenseVoiceSmall', + *args, + **kwargs): + """ + Initialization method. 
+ + :param args: extra args + :param kwargs: extra args + """ + super().__init__(*args, **kwargs) + self._batched_op = True + self._accelerator = 'cuda' + self._model_sampling_rate = 16000 + self.model_dir_ASR = model_dir_ASR + + self.model_key = prepare_model( + model_type='SenseVoiceSmall', + pretrained_model_name_or_path=self.model_dir_ASR, + ) + + def process(self, sample, rank=None): + # check if it's generated already + if Fields.speech_ASR in sample: + return sample + + # there is no video in this sample + if self.video_key not in sample or not sample[self.video_key]: + sample[Fields.video_audio_tags] = [] + return sample + + # load video paths + loaded_video_keys = sample[self.video_key][0] + audio_tags = sample[Fields.video_audio_tags][0] + + # model, feature_extractor = get_model(self.model_key, rank=rank) + video_audio_tags = [] + for id,video_path in enumerate(loaded_video_keys): + if audio_tags[id] == 'Speech': + # only extract audio data and sr for index 0 for now + ys, srs, valid_indexes = extract_audio_from_video( + video_path, stream_indexes=[0]) + if len(valid_indexes) == 0: + # there is no valid audio streams. Skip! + video_audio_tags.append(self._no_audio_label) + continue + + # inference + y = ys[0] + sr = srs[0] + # check if it meets the sampling rate condition of the model + if sr != self._model_sampling_rate: + y = librosa.resample(y, + orig_sr=sr, + target_sr=self._model_sampling_rate) + sr = self._model_sampling_rate + + ASR_model, kwargs1= get_model(self.model_key, rank=rank) + inputs = torch.tensor(y).to(next(ASR_model.parameters()).device) + with torch.no_grad(): + output_ASR_emo = ASR_model.inference( + data_in=inputs, + language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech" + use_itn=False, + **kwargs1, + ) + + video_audio_tags.append({'language':output_ASR_emo[0][0]['text'].split('<|',1)[-1].split('|>')[0], 'asr': output_ASR_emo[0][0]['text'].split('|>',4)[-1]}) + else: + video_audio_tags.append('') + + sample[Fields.speech_ASR] = video_audio_tags + gc.collect() + torch.cuda.empty_cache() + return sample diff --git a/data_juicer/ops/mapper/video_audio_speech_emotion_mapper.py b/data_juicer/ops/mapper/video_audio_speech_emotion_mapper.py new file mode 100644 index 0000000000..61d619b873 --- /dev/null +++ b/data_juicer/ops/mapper/video_audio_speech_emotion_mapper.py @@ -0,0 +1,101 @@ +import librosa + +from data_juicer.utils.availability_utils import AvailabilityChecking +from data_juicer.utils.constant import Fields +from data_juicer.utils.mm_utils import extract_audio_from_video +from data_juicer.utils.model_utils import get_model, prepare_model + +from ..base_op import OPERATORS, Mapper +import gc + +from data_juicer.my_pretrained_method.SenseVoice.model import SenseVoiceSmall + + +OP_NAME = 'video_audio_speech_emotion_mapper' + +with AvailabilityChecking(['torch', 'transformers', 'torchaudio'], OP_NAME): + import torch + torch.set_num_threads(1) + + +@OPERATORS.register_module(OP_NAME) +class VideoAudioSpeechEmotionMapper(Mapper): + """Mapper to generate video tags from audio streams extracted by video + using the Audio Spectrogram Transformer. + """ + + def __init__(self, + model_dir_emo='model_path/SenseVoiceSmall', + *args, + **kwargs): + """ + Initialization method. 
+ + :param args: extra args + :param kwargs: extra args + """ + super().__init__(*args, **kwargs) + self._batched_op = True + self._accelerator = 'cuda' + self._model_sampling_rate = 16000 + self.model_dir_emo = model_dir_emo + + self.model_key = prepare_model( + model_type='SenseVoiceSmall', + pretrained_model_name_or_path=self.model_dir_emo, + ) + + def process(self, sample, rank=None): + # check if it's generated already + if Fields.speech_emotion in sample: + return sample + + # there is no video in this sample + if self.video_key not in sample or not sample[self.video_key]: + sample[Fields.video_audio_tags] = [] + return sample + + # load video paths + loaded_video_keys = sample[self.video_key][0] + audio_tags = sample[Fields.video_audio_tags][0] + + # model, feature_extractor = get_model(self.model_key, rank=rank) + video_audio_tags = [] + for id,video_path in enumerate(loaded_video_keys): + if audio_tags[id] == 'Speech': + # only extract audio data and sr for index 0 for now + ys, srs, valid_indexes = extract_audio_from_video( + video_path, stream_indexes=[0]) + if len(valid_indexes) == 0: + # there is no valid audio streams. Skip! + video_audio_tags.append(self._no_audio_label) + continue + + # inference + y = ys[0] + sr = srs[0] + # check if it meets the sampling rate condition of the model + if sr != self._model_sampling_rate: + y = librosa.resample(y, + orig_sr=sr, + target_sr=self._model_sampling_rate) + sr = self._model_sampling_rate + + ASR_Emo_model, kwargs1= get_model(self.model_key, rank=rank) + inputs = torch.tensor(y).to(next(ASR_Emo_model.parameters()).device) + with torch.no_grad(): + output_ASR_emo = ASR_Emo_model.inference( + data_in=inputs, + language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech" + use_itn=False, + **kwargs1, + ) + + video_audio_tags.append(output_ASR_emo[0][0]['text'].split('<|',2)[-1].split('|>')[0]) + else: + video_audio_tags.append('') + + sample[Fields.speech_emotion] = video_audio_tags + gc.collect() + torch.cuda.empty_cache() + return sample diff --git a/data_juicer/ops/mapper/video_captioning_face_attribute_emotion_mapper.py b/data_juicer/ops/mapper/video_captioning_face_attribute_emotion_mapper.py new file mode 100644 index 0000000000..e7e5323b10 --- /dev/null +++ b/data_juicer/ops/mapper/video_captioning_face_attribute_emotion_mapper.py @@ -0,0 +1,123 @@ +import numpy as np +from data_juicer.utils.constant import Fields +from data_juicer.utils.availability_utils import AvailabilityChecking +from data_juicer.utils.model_utils import get_model, prepare_model +from ..base_op import OPERATORS, Mapper +from ..op_fusion import LOADED_VIDEOS +from data_juicer.utils.ASD_mapper_utils import get_video_array_cv2, annotate_video_with_bounding_boxes, crop_from_array +import sys +sys.path.append('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/ShareGPT4Video') +import gc + +OP_NAME = 'video_captioning_face_attribute_emotion_mapper' + +with AvailabilityChecking(['torch', 'transformers'], + OP_NAME): + + import torch, os, tempfile, shutil + from shutil import rmtree + import pickle, copy, cv2 + import transformers # noqa: F401 + + # avoid hanging when calling clip in multiprocessing + torch.set_num_threads(1) +import sys +sys.path.append('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/VideoLLaMA2') +from videollama2 import mm_infer + + +@OPERATORS.register_module(OP_NAME) +@LOADED_VIDEOS.register_module(OP_NAME) +class VideoCaptioningFaceAttributeEmotionMapper(Mapper): + """Mapper to generate samples whose captions are 
generated based on + a video-to-text model and sampled video frame.""" + + def __init__( + self, + face_track_query: str = "Please describe the person's facial expression, tell me the person's emotion through the video, like Happiness, Excitement, Love, Gratitude, Relief, Pride, Anger, Sadness, Fear, Guilt, Shame, Disgust, Surprise, Confusion, Curiosity, Boredom ...", + cropping_face_video_tempt_path = './tempt_video/tmp_video_remove', + video_describe_model_path: str = 'pt_model/VideoLLaMA2', + *args, + **kwargs + ): + """ + Initialization method. + + :param hf_video_blip: video-blip model name on huggingface + to generate caption + """ + super().__init__(*args, **kwargs) + + self._batched_op = True + self._accelerator = 'cuda' + self.context_param = 0.8 + + # self.pre_query_prompt = "The provided image arranges keyframes from a video in a grid view, keyframes are separated with white bands. " + self.query = face_track_query + self.cropping_face_video_tempt_path = cropping_face_video_tempt_path + + self.model_key = prepare_model( + model_type='VideoLLaMA2', + pretrained_model_name_or_path=video_describe_model_path, + ) + + + + def process(self, samples, rank=None): + + Total_information = [] + video_samples = samples[Fields.human_track_data_path] + loaded_video_keys = samples[self.video_key][0] + + cropping_face_video_tempt_path = tempfile.mkdtemp(dir=self.cropping_face_video_tempt_path) + if os.path.exists(cropping_face_video_tempt_path): + rmtree(cropping_face_video_tempt_path) + + os.makedirs(cropping_face_video_tempt_path, exist_ok = False) + model, processor, tokenizer= get_model(self.model_key, rank=rank) + for vedio_id,ASD_attribute_all_tracks_for_one_video in enumerate(video_samples[0]): + if len(ASD_attribute_all_tracks_for_one_video) == 0: + Total_information.append([]) + continue + + description_for_each_track = [] + video_array = get_video_array_cv2(loaded_video_keys[vedio_id]) + for track_id,tracks_now in enumerate(ASD_attribute_all_tracks_for_one_video): + cs = self.context_param + + with open(tracks_now['bbox_path'], 'rb') as f: + bbox_data = pickle.load(f) + xys_bbox = bbox_data['xys_bbox'] + track_frame = bbox_data['frame'] + + face_video_out_path = os.path.join(cropping_face_video_tempt_path, loaded_video_keys[vedio_id].split('/')[-1][:-4] + '__' + str(track_id) + '.mp4') + vOut = cv2.VideoWriter(face_video_out_path, cv2.VideoWriter_fourcc(*'XVID'), 25, (224,224))# Write video + + start_frame_id_in = 0 + start_frame_id_out = track_frame[start_frame_id_in] # tag + while start_frame_id_in + 1 = minTrack: # Discard the shot frames less than minTrack frames + allTracks.extend(track_shot(faces[shot[0].frame_num:shot[1].frame_num])) # 'frames' to present this tracks' timestep, 'bbox' presents the location of the faces + + # Get face and human tracks + for ii, track in tqdm.tqdm(enumerate(allTracks), total = len(allTracks)): + # 待优化! 
+ result = get_face_and_human_tracks(video_array, track, human_detection_model) + if result: + vidTracks.append(result) + # merge + people_num_atleast, update_track = post_merge(vidTracks,video_array) + + for i in range(len(update_track)): + save_bbox_name = os.path.join(self.face_track_bbox_path, video_key.split("/")[-1][:-4] +'_'+str(i)+'.pkl') + xy_bbox = update_track[i]['track']['bbox'] + xys_bbox = update_track[i]['proc_track'] + xy_human_bbox = update_track[i]['human_bbox'] + frames = update_track[i]['track']['frame'] + bbox_dict = {'frame':frames, 'xy_bbox':xy_bbox, 'xys_bbox':xys_bbox, 'xy_human_bbox':xy_human_bbox} + f_save = open(save_bbox_name, 'wb') + pickle.dump(bbox_dict, f_save) + f_save.close() + del update_track[i]['human_bbox'] + del update_track[i]['proc_track'] + del update_track[i]['track'] + update_track[i]['bbox_path'] = save_bbox_name + + + Total_result.append(update_track) + min_people_in_video.append(people_num_atleast) + torch.cuda.empty_cache() + + sample[Fields.human_track_data_path] = Total_result + sample[Fields.min_people_in_video] = min_people_in_video + + gc.collect() + torch.cuda.empty_cache() + + return sample diff --git a/data_juicer/utils/ASD_mapper_utils.py b/data_juicer/utils/ASD_mapper_utils.py new file mode 100644 index 0000000000..ff4ae3a40a --- /dev/null +++ b/data_juicer/utils/ASD_mapper_utils.py @@ -0,0 +1,797 @@ +from scenedetect.video_manager import VideoManager +from scipy import signal +import os +import sys +import copy +import cv2 +import numpy as np +from scipy.interpolate import interp1d +from scipy.io import wavfile +import sys, os, tqdm, torch, subprocess, cv2, numpy, math, python_speech_features +from deepface import DeepFace +from data_juicer.my_pretrained_method.YOLOv8_human.dj import demo +sys.path.append('./data_juicer/my_pretrained_method/Light-ASD') + +def scene_detect(videoFilePath): + # CPU: Scene detection, output is the list of each shot's time duration + videoManager = VideoManager([videoFilePath]) + sceneList = [(videoManager.base_timecode, videoManager.duration)] + return sceneList + + +def inference_video(video_array, DET): + # from model.faceDetector.s3fd import S3FD + # GPU: Face detection, output is the list contains the face location and score in this frame + # DET = S3FD(device='cuda') + dets = [] + total_frame = video_array.shape[0] + for fidx in range(total_frame): + image = video_array[fidx] + imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[0.25]) + dets.append([]) + for bbox in bboxes: + dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) # dets has the frames info, bbox info, conf info + # sys.stderr.write('%s-%05d; %d dets\r' % (args.videoFilePath, fidx, len(dets[-1]))) + + return dets + +def get_video_array_cv2(videoFilePath): + cap = cv2.VideoCapture(videoFilePath) + if not cap.isOpened(): + print(f"Error: Cannot open video file {videoFilePath}") + return None + + frames = [] + while True: + ret, frame = cap.read() + if not ret: + break + frames.append(frame) + + cap.release() + frames_array = np.array(frames) + return frames_array + +def bb_intersection_over_union(boxA, boxB, evalCol = False): + # CPU: IOU Function to calculate overlap between two image + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + interArea = max(0, xB - xA) * max(0, yB - yA) + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) + boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - 
boxB[1]) + if evalCol == True: + iou = interArea / float(boxAArea) + else: + iou = interArea / float(boxAArea + boxBArea - interArea) + return iou + +import copy +def track_shot(sceneFaces, numFailedDet=8, minTrack=10): + # CPU: Face tracking + iouThres = 0.55 # Minimum IOU between consecutive face detections + tracks = [] + while True: + track = [] + for frameFaces in sceneFaces: + best_match = None + max_iou = 0 + frameFaces_ori = copy.deepcopy(frameFaces) + for face in frameFaces_ori: + if track == []: + track.append(face) + frameFaces.remove(face) + elif face['frame'] - track[-1]['frame'] <= numFailedDet and not face['frame'] == track[-1]['frame']: + iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox']) + + if iou > iouThres and iou > max_iou: + best_match = face + max_iou = iou + else: + break + + if best_match is not None: + track.append(best_match) + frameFaces.remove(best_match) + + if track == []: + break + elif len(track) > minTrack: + frameNum = np.array([ f['frame'] for f in track ]) + bboxes = np.array([np.array(f['bbox']) for f in track]) + frameI = np.arange(frameNum[0],frameNum[-1]+1) + bboxesI = [] + for ij in range(0,4): + interpfn = interp1d(frameNum, bboxes[:,ij]) + bboxesI.append(interpfn(frameI)) + bboxesI = np.stack(bboxesI, axis=1) + if max(np.mean(bboxesI[:,2]-bboxesI[:,0]), np.mean(bboxesI[:,3]-bboxesI[:,1])) > 1: + tracks.append({'frame':frameI,'bbox':bboxesI}) + return tracks + + +def find_human_bounding_box(face_bbox, human_bboxes): + head_x1, head_y1, head_x2, head_y2 = face_bbox + head_center_x = (head_x1 + head_x2)/2 + + candidate_bboxes = [] + + for human_bbox in human_bboxes: + human_x1, human_y1, human_x2, human_y2 = human_bbox + + if (human_x1 <= head_x1 and head_x2 <= human_x2) and (human_y1 <= head_y1 and head_y2 <= human_y2): + candidate_bboxes.append(human_bbox) + + if not candidate_bboxes: + return () + + # Select the human body bounding box with the smallest distance between (x1 + x2) / 2 and (x1 + x2) / 2 of face_bbox + closest_bbox = min(candidate_bboxes, key=lambda bbox: (((bbox[0] + bbox[2]) / 2) - head_center_x)**2 + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])) + + return closest_bbox + +def update_negative_ones(values): + n = len(values) + i = 0 + + while i < n: + if values[i] == -1: + # Find the nearest number on the left + left_index = i - 1 + while left_index >= 0 and values[left_index] == -1: + left_index -= 1 + + # Find the nearest number on the right + right_index = i + 1 + while right_index < n and values[right_index] == -1: + right_index += 1 + + # Update the value of -1 + if left_index >= 0 and right_index < n: + left_value = values[left_index] + right_value = values[right_index] + values[i] = (left_value + right_value) / 2 + elif left_index >= 0: + values[i] = values[left_index] + elif right_index < n: + values[i] = values[right_index] + else: + raise ValueError("Unable to find valid values ​​on both the left and right to update -1 at index {i}") + i += 1 + + return values + + +def detect_and_mark_anomalies(data, window_size=7, std_multiplier=2): + data = np.array(data) + result = data.copy() + + for i in range(len(data)): + if data[i] > 0: + start = max(0, i - window_size) + end = min(len(data), i + window_size + 1) + neighbors = data[start:end] + + neighbors = np.delete(neighbors, np.where(neighbors == data[i])) + + positive_neighbors = neighbors[neighbors > 0] + + if len(positive_neighbors) < 2: + continue + + mean = np.mean(positive_neighbors) + std = np.std(positive_neighbors) + + if abs(data[i] - mean) > std * 
std_multiplier: + result[i] = -1 + + return result + + +def get_face_and_human_tracks(video_array, track, human_detection_pipeline): + dets = {'x':[], 'y':[], 's':[]} + for det in track['bbox']: # Read the tracks + dets['s'].append(max((det[3]-det[1]), (det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center x + dets['x'].append((det[0]+det[2])/2) # crop center y + + # human_bounding_box + human_bbox = {'x1':[], 'y1':[], 'x2':[], 'y2':[]} + for in_id,out_track_id in enumerate(track['frame']): # Read the tracks + frame_ = video_array[out_track_id] + head_x1, head_y1, head_x2, head_y2 = track['bbox'][in_id] + human_bbox_list = demo(frame_, human_detection_pipeline) + result = find_human_bounding_box((head_x1, head_y1, head_x2, head_y2), human_bbox_list) + if result == (): + human_bbox['x1'].append(-1) + human_bbox['y1'].append(-1) + human_bbox['x2'].append(-1) + human_bbox['y2'].append(-1) + else: + human_bbox['x1'].append(result[0]) + human_bbox['y1'].append(result[1]) + human_bbox['x2'].append(result[2]) + human_bbox['y2'].append(result[3]) + if (np.array(human_bbox['x1'])<0).sum() > 0: + if all(element < 0 for element in human_bbox['x1']): + return False + human_bbox['x1'] = detect_and_mark_anomalies(human_bbox['x1'], window_size=30, std_multiplier=10) + human_bbox['x1'] = update_negative_ones(human_bbox['x1']) + if (np.array(human_bbox['y1'])<0).sum() > 0: + human_bbox['y1'] = detect_and_mark_anomalies(human_bbox['y1'], window_size=30, std_multiplier=10) + human_bbox['y1'] = update_negative_ones(human_bbox['y1']) + if (np.array(human_bbox['x2'])<0).sum() > 0: + human_bbox['x2'] = detect_and_mark_anomalies(human_bbox['x2'], window_size=30, std_multiplier=10) + human_bbox['x2'] = update_negative_ones(human_bbox['x2']) + if (np.array(human_bbox['y2'])<0).sum() > 0: + human_bbox['y2'] = detect_and_mark_anomalies(human_bbox['y2'], window_size=30, std_multiplier=10) + human_bbox['y2'] = update_negative_ones(human_bbox['y2']) + human_bbox['x1'] = signal.medfilt(human_bbox['x1'], kernel_size=5).tolist() + human_bbox['y1'] = signal.medfilt(human_bbox['y1'], kernel_size=5).tolist() + human_bbox['x2'] = signal.medfilt(human_bbox['x2'], kernel_size=5).tolist() + human_bbox['y2'] = signal.medfilt(human_bbox['y2'], kernel_size=5).tolist() + + return {'track':track, 'proc_track':dets, 'human_bbox':human_bbox} + +def crop_video_with_facetrack(video_array, track, cropFile, audioFilePath,is_empty=False): + if is_empty: + return True + + dets = track['xys_bbox'] + # CPU: crop the face clips + vOut = cv2.VideoWriter(cropFile + 't.avi', cv2.VideoWriter_fourcc(*'XVID'), 25, (224,224))# Write video + + for fidx, frame in enumerate(track['frame']): + cs = 0.4 + bs = dets['s'][fidx] # Detection box size + bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount + image = video_array[frame] + frame = numpy.pad(image, ((bsi,bsi), (bsi,bsi), (0, 0)), 'constant', constant_values=(110, 110)) + my = dets['y'][fidx] + bsi # BBox center Y + mx = dets['x'][fidx] + bsi # BBox center X + face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] + vOut.write(cv2.resize(face, (224, 224))) + audioTmp = cropFile + '.wav' + audioStart = (track['frame'][0]) / 25 + audioEnd = (track['frame'][-1]+1) / 25 + vOut.release() + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic" % \ + (audioFilePath, 10, audioStart, audioEnd, audioTmp)) + output = subprocess.call(command, shell=True, stdout=None) # Crop audio file + _, audio = 
wavfile.read(audioTmp) + command = ("ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic" % \ + (cropFile, audioTmp, 10, cropFile)) # Combine audio and video file + output = subprocess.call(command, shell=True, stdout=None) + os.remove(cropFile + 't.avi') + return True + + + +def crop_video(video_array, track, cropFile, audioFilePath, human_detection_pipeline,is_empty=False): + dets = {'x':[], 'y':[], 's':[]} + for det in track['bbox']: # Read the tracks + dets['s'].append(max((det[3]-det[1]), (det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center x + dets['x'].append((det[0]+det[2])/2) # crop center y + + # human_bounding_box + human_bbox = {'x1':[], 'y1':[], 'x2':[], 'y2':[]} + for in_id,out_track_id in enumerate(track['frame']): # Read the tracks + frame_ = video_array[out_track_id] + head_x1, head_y1, head_x2, head_y2 = track['bbox'][in_id] + human_bbox_list = demo(frame_, human_detection_pipeline) + result = find_human_bounding_box((head_x1, head_y1, head_x2, head_y2), human_bbox_list) + if result == (): + human_bbox['x1'].append(-1) + human_bbox['y1'].append(-1) + human_bbox['x2'].append(-1) + human_bbox['y2'].append(-1) + else: + human_bbox['x1'].append(result[0]) + human_bbox['y1'].append(result[1]) + human_bbox['x2'].append(result[2]) + human_bbox['y2'].append(result[3]) + if (np.array(human_bbox['x1'])<0).sum() > 0: + if all(element < 0 for element in human_bbox['x1']): + return False + human_bbox['x1'] = update_negative_ones(human_bbox['x1']) + if (np.array(human_bbox['y1'])<0).sum() > 0: + human_bbox['y1'] = update_negative_ones(human_bbox['y1']) + if (np.array(human_bbox['x2'])<0).sum() > 0: + human_bbox['x2'] = update_negative_ones(human_bbox['x2']) + if (np.array(human_bbox['y2'])<0).sum() > 0: + human_bbox['y2'] = update_negative_ones(human_bbox['y2']) + human_bbox['x1'] = signal.medfilt(human_bbox['x1'], kernel_size=5).tolist() + human_bbox['y1'] = signal.medfilt(human_bbox['y1'], kernel_size=5).tolist() + human_bbox['x2'] = signal.medfilt(human_bbox['x2'], kernel_size=5).tolist() + human_bbox['y2'] = signal.medfilt(human_bbox['y2'], kernel_size=5).tolist() + + if is_empty: + return {'track':track, 'proc_track':dets, 'human_bbox':human_bbox} + + # CPU: crop the face clips + vOut = cv2.VideoWriter(cropFile + 't.avi', cv2.VideoWriter_fourcc(*'XVID'), 25, (224,224))# Write video + + for fidx, frame in enumerate(track['frame']): + cs = 0.4 + bs = dets['s'][fidx] # Detection box size + bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount + image = video_array[frame] + frame = numpy.pad(image, ((bsi,bsi), (bsi,bsi), (0, 0)), 'constant', constant_values=(110, 110)) + my = dets['y'][fidx] + bsi # BBox center Y + mx = dets['x'][fidx] + bsi # BBox center X + face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] + vOut.write(cv2.resize(face, (224, 224))) + audioTmp = cropFile + '.wav' + audioStart = (track['frame'][0]) / 25 + audioEnd = (track['frame'][-1]+1) / 25 + vOut.release() + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic" % \ + (audioFilePath, 10, audioStart, audioEnd, audioTmp)) + output = subprocess.call(command, shell=True, stdout=None) # Crop audio file + _, audio = wavfile.read(audioTmp) + command = ("ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic" % \ + (cropFile, audioTmp, 10, cropFile)) # Combine audio and video file + output = subprocess.call(command, shell=True, stdout=None) + 
    os.remove(cropFile + 't.avi')
+    return {'track': track, 'proc_track': dets, 'human_bbox': human_bbox}
+
+
+def evaluate_network(files, s, pycropPath):
+    # GPU: active speaker detection by the pretrained model
+    allScores = []
+    # durationSet = {1,2,4,6}
+    durationSet = {1,1,1,2,2,2,3,3,4,5,6}  # use this set to get a more reliable result
+    for file in tqdm.tqdm(files, total=len(files)):
+        fileName = os.path.splitext(file.split('/')[-1])[0]  # Load audio and video
+        _, audio = wavfile.read(os.path.join(pycropPath, fileName + '.wav'))
+        if len(audio) == 0:
+            scores = numpy.array([-5])
+            allScores.append(scores)
+            continue
+
+        audioFeature = python_speech_features.mfcc(audio, 16000, numcep=13, winlen=0.025, winstep=0.010)
+
+        video = cv2.VideoCapture(os.path.join(pycropPath, fileName + '.avi'))
+        videoFeature = []
+        while video.isOpened():
+            ret, frames = video.read()
+            if ret == True:
+                face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
+                face = cv2.resize(face, (224, 224))
+                face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))]
+                videoFeature.append(face)
+            else:
+                break
+        video.release()
+        videoFeature = np.array(videoFeature)
+        length = min((audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100, videoFeature.shape[0] / 25)
+        audioFeature = audioFeature[:int(round(length * 100)), :]
+        videoFeature = videoFeature[:int(round(length * 25)), :, :]
+        allScore = []  # evaluate with the model
+        for duration in durationSet:
+            batchSize = int(math.ceil(length / duration))
+            scores = []
+            with torch.no_grad():
+                for i in range(batchSize):
+                    inputA = torch.FloatTensor(audioFeature[i * duration * 100:(i+1) * duration * 100, :]).unsqueeze(0).to(next(s.parameters()).device)
+                    inputV = torch.FloatTensor(videoFeature[i * duration * 25:(i+1) * duration * 25, :, :]).unsqueeze(0).to(next(s.parameters()).device)
+                    embedA = s.model.forward_audio_frontend(inputA)
+                    embedV = s.model.forward_visual_frontend(inputV)
+                    out = s.model.forward_audio_visual_backend(embedA, embedV)
+                    score = s.lossAV.forward(out, labels=None)
+                    scores.extend(score)
+                    del inputA
+                    del inputV
+                    del embedA
+                    del embedV
+            allScore.append(scores)
+        allScore = numpy.round((numpy.mean(numpy.array(allScore), axis=0)), 1).astype(float)
+        allScores.append(allScore)
+    return allScores
+
+
+def visualization(tracks, scores, video_array, pyaviPath):
+    # CPU: visualize the result in video format
+
+    faces = [[] for i in range(video_array.shape[0])]
+    for tidx, track in enumerate(tracks):
+        score = scores[tidx]
+        for fidx, frame in enumerate(track['track']['frame'].tolist()):
+            s = score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)]  # average smoothing
+            s = numpy.mean(s)
+            faces[frame].append({'track': tidx, 'score': float(s), 's': track['proc_track']['s'][fidx], 'x': track['proc_track']['x'][fidx], 'y': track['proc_track']['y'][fidx]})
+    firstImage = video_array[0]
+    fw = firstImage.shape[1]
+    fh = firstImage.shape[0]
+    vOut = cv2.VideoWriter(os.path.join(pyaviPath, 'video_only.avi'), cv2.VideoWriter_fourcc(*'XVID'), 25, (fw, fh))
+    colorDict = {0: 0, 1: 255}
+    for fidx in tqdm.tqdm(range(video_array.shape[0])):
+        image = video_array[fidx]
+        for face in faces[fidx]:
+            clr = colorDict[int((face['score'] >= 0))]
+            txt = round(face['score'], 1)
+            cv2.rectangle(image, (int(face['x']-face['s']), int(face['y']-face['s'])), (int(face['x']+face['s']), int(face['y']+face['s'])), (0, clr, 255-clr), 10)
+            cv2.putText(image, '%s'%(txt), (int(face['x']-face['s']), int(face['y']-face['s'])), cv2.FONT_HERSHEY_SIMPLEX,
1.5, (0,clr,255-clr),5) + vOut.write(image) + vOut.release() + command = ("ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic" % \ + (os.path.join(pyaviPath, 'video_only.avi'), os.path.join(pyaviPath, 'audio.wav'), \ + 10, os.path.join(pyaviPath,'video_out.avi'))) + output = subprocess.call(command, shell=True, stdout=None) + +def calculate_good_matches(matches, ratio=0.75): + good_matches = [] + for m, n in matches: + if m.distance < ratio * n.distance: + good_matches.append(m) + return len(good_matches) + +def find_max_intersection_and_remaining_dicts(dicts): + if not dicts: + return [], [] + + track_frames = [d['track']['frame'] for d in dicts] + + all_elements = set() + for frame in track_frames: + all_elements.update(frame) + + max_combination_indices = [] + max_intersection = set() + + for elem in all_elements: + current_combination_indices = [] + current_intersection = set([elem]) + + for i, frame in enumerate(track_frames): + if elem in frame: + current_combination_indices.append(i) + current_intersection.intersection_update(frame) + + if len(current_combination_indices) > len(max_combination_indices): + max_combination_indices = current_combination_indices + max_intersection = current_intersection + + max_combination = [dicts[i] for i in max_combination_indices] + remaining_dicts = [d for i, d in enumerate(dicts) if i not in max_combination_indices] + + return max_combination, remaining_dicts + +def get_faces_array(frame,s,x,y): + cs = 0.4 + bs = s # Detection box size + bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount + image = frame + frame = np.pad(image, ((bsi,bsi), (bsi,bsi), (0, 0)), 'constant', constant_values=(110, 110)) + my = y + bsi # BBox center Y + mx = x + bsi # BBox center X + face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] + return face + + +def order_track_distance(track1,track2,video_array): + # Get the last face frame of track1 and the first face frame of track2 + track1_end_frame = video_array[track1['track']['frame'][-1]] + track1_s = track1['proc_track']['s'][-1] + track1_x = track1['proc_track']['x'][-1] + track1_y = track1['proc_track']['y'][-1] + track1_end_face_array = get_faces_array(track1_end_frame,track1_s,track1_x,track1_y) + + track2_start_frame = video_array[track2['track']['frame'][0]] + track2_s = track2['proc_track']['s'][0] + track2_x = track2['proc_track']['x'][0] + track2_y = track2['proc_track']['y'][0] + track2_strat_face_array = get_faces_array(track2_start_frame,track2_s,track2_x,track2_y) + + # Calculate the area overlap ratio + track1_bbox = track1['track']['bbox'][-1] + track2_bbox = track2['track']['bbox'][0] + iou = bb_intersection_over_union(track1_bbox, track2_bbox) + if iou <= 0.2: + distance_iou = 10000 + else: + distance_iou = math.exp(-5*iou) + + normalized_distance = 0 + + # face_id distance (with facenet) + result = DeepFace.verify(track1_end_face_array, track2_strat_face_array, model_name='Facenet', detector_backend = 'skip') + facenet_distance = result['distance'] + if facenet_distance > 0.85: + facenet_distance = facenet_distance + 10000 + + distance = 2*distance_iou + normalized_distance + facenet_distance + + return distance + +def update_remain(remaining_dicts, pop_item): + updated_dicts = [item for item in remaining_dicts if item['track']['bbox'].shape != pop_item['track']['bbox'].shape or (item['track']['bbox'] != pop_item['track']['bbox']).any()] + return updated_dicts + +def order_merge_tracks(track1,track2): + new_track = {} + new_track['proc_track'] = 
{} + new_track['proc_track']['x'] = track1['proc_track']['x'] + track2['proc_track']['x'] + new_track['proc_track']['y'] = track1['proc_track']['y'] + track2['proc_track']['y'] + new_track['proc_track']['s'] = track1['proc_track']['s'] + track2['proc_track']['s'] + new_track['human_bbox'] = {} + new_track['human_bbox']['x1'] = track1['human_bbox']['x1'] + track2['human_bbox']['x1'] + new_track['human_bbox']['y1'] = track1['human_bbox']['y1'] + track2['human_bbox']['y1'] + new_track['human_bbox']['x2'] = track1['human_bbox']['x2'] + track2['human_bbox']['x2'] + new_track['human_bbox']['y2'] = track1['human_bbox']['y2'] + track2['human_bbox']['y2'] + + new_track['track'] = {} + for key in list(track1['track'].keys()): + object1 = track1['track'][key] + object2 = track2['track'][key] + if isinstance(object1, np.ndarray): + new_track['track'][key] = np.concatenate((object1, object2)) + elif isinstance(object1, list): + new_track['track'][key] = object1 + object2 + else: + raise('new data type') + + return new_track + +def post_merge(vidTracks,video_array): + # Find the maximum overlapping tracks as the initial anchor + anchor_combination, remaining_dicts = find_max_intersection_and_remaining_dicts(vidTracks) + end_frame = video_array.shape[0] + continue_flag = np.ones((len(anchor_combination),2)) + max_iteration = 10 + iteration_count = 0 + while iteration_count0: + for track_ind in range(len(anchor_combination)): + track = anchor_combination[track_ind] + # Try to extend forward + if continue_flag[track_ind][0]: + if track['track']['frame'][0] == 0: + continue_flag[track_ind][0] = 0 + else: + # Find the candidate that is connected to it and is in the front row + possible_prior_tracks = [] + for checktrack in remaining_dicts: + if checktrack['track']['frame'][-1]+1 == track['track']['frame'][0] or checktrack['track']['frame'][-1]+2 == track['track']['frame'][0]: + possible_prior_tracks.append(checktrack) + # If it is not zero, then check the calculated distance + if len(possible_prior_tracks)>0: + distance_score_list = [] + for possible_prior_track in possible_prior_tracks: + distance_score_list.append(order_track_distance(possible_prior_track, track, video_array)) + distance_score_array = np.array(distance_score_list) + if min(distance_score_array) < 10000: + min_index = np.argmin(distance_score_array) + new_anchor = order_merge_tracks(possible_prior_tracks[min_index], track) + # update_anchor() + anchor_combination[track_ind] = new_anchor + track = new_anchor + remaining_dicts = update_remain(remaining_dicts, possible_prior_tracks[min_index]) + else: + continue_flag[track_ind][0] = 0 + else: + continue_flag[track_ind][0] = 0 + # Try to extend backwards + if continue_flag[track_ind][1]: + if track['track']['frame'][-1] == end_frame: + continue_flag[track_ind][0] = 0 + else: + # Find the candidate that is connected to it and in front of it + possible_after_tracks = [] + for checktrack in remaining_dicts: + if checktrack['track']['frame'][0]-1 == track['track']['frame'][-1] or checktrack['track']['frame'][0]-2 == track['track']['frame'][-1]: + possible_after_tracks.append(checktrack) + # If it is not zero, then check the calculated distance + if len(possible_after_tracks)>0: + distance_score_list = [] + for possible_after_track in possible_after_tracks: + distance_score_list.append(order_track_distance(track, possible_after_track, video_array)) + distance_score_array = np.array(distance_score_list) + if min(distance_score_array) < 10000: + min_index = np.argmin(distance_score_array) + new_anchor 
= order_merge_tracks(track, possible_after_tracks[min_index]) + # update_anchor() + anchor_combination[track_ind] = new_anchor + remaining_dicts = update_remain(remaining_dicts, possible_after_tracks[min_index]) + else: + continue_flag[track_ind][1] = 0 + else: + continue_flag[track_ind][1] = 0 + + final_tracks = anchor_combination + remaining_dicts + if len(final_tracks) > 5: + sorted_tracks = sorted(final_tracks, key=lambda x: len(x['track']['frame']), reverse=True) + top_tracks = sorted_tracks[:5] + else: + top_tracks = final_tracks + # return len(anchor_combination), top_5_tracks + returntracks = [] + for item in top_tracks: + if len(item['track']['frame'])>15: + returntracks.append(item) + return len(anchor_combination), returntracks + + +def longest_continuous_actives(arr): + max_length = 0 + current_length = 0 + + for num in arr: + if num > 0: + current_length += 1 + if current_length > max_length: + max_length = current_length + else: + current_length = 0 + + return max_length + +import pickle +import moviepy.editor as mp + +def annotate_video_with_bounding_boxes_with_audio(video_path, q_human_video_track_bbox, output_path): + bbox_path = q_human_video_track_bbox['bbox_path'] + frame_indices = q_human_video_track_bbox['track']['frame'] + video_array = get_video_array_cv2(video_path) + + with open(bbox_path, 'rb') as f: + bbox_data = pickle.load(f) + xy_bbox = bbox_data['xy_bbox'] + + # Get video dimensions and frame rate + cap = cv2.VideoCapture(video_path) + fps = cap.get(cv2.CAP_PROP_FPS) # Get original video frame rate + num_frames, height, width, channels = video_array.shape + assert channels == 3, "Input video must have 3 channels (BGR)." + + # Initialize video writer + fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for mp4 + temp_video_path = output_path.split('.')[0] + 'temp.mp4' + out = cv2.VideoWriter(temp_video_path, fourcc, fps, (width, height)) # Use original FPS + + # Annotate video frames with bounding boxes + for i in range(num_frames): + frame = video_array[i] + if i in frame_indices: + idx = frame_indices.index(i) + x1, y1, x2, y2 = xy_bbox[idx] + # Draw bounding box + thickness = max(int((x2 - x1) / 40), 2) + cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), thickness) + + # Write frame to temporary video + out.write(frame) + + out.release() + cap.release() # Release the video capture object + + # Load original video and audio + original_video = mp.VideoFileClip(video_path) + annotated_video = mp.VideoFileClip(temp_video_path) + + # Combine annotated video with original audio, ensuring alignment + final_video = annotated_video.set_audio(original_video.audio) + + # Write the final output video with audio + final_video.write_videofile(output_path, codec='libx264', audio_codec='aac', fps=fps) + + # Clean up temporary video file + annotated_video.close() + original_video.close() + + # Optionally, remove the temporary video file + import os + if os.path.exists(temp_video_path): + os.remove(temp_video_path) + + return output_path + +def annotate_video_with_bounding_boxes_withText_with_audio(video_path, q_human_video_track_bbox, output_path, numbers): + bbox_path = q_human_video_track_bbox['bbox_path'] + frame_indices = q_human_video_track_bbox['track']['frame'] + video_array = get_video_array_cv2(video_path) + + with open(bbox_path, 'rb') as f: + bbox_data = pickle.load(f) + xy_bbox = bbox_data['xy_bbox'] + + # Get video dimensions and frame rate + cap = cv2.VideoCapture(video_path) + fps = cap.get(cv2.CAP_PROP_FPS) # Get original video frame 
rate + num_frames, height, width, channels = video_array.shape + assert channels == 3, "Input video must have 3 channels (BGR)." + + # Initialize video writer + fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for mp4 + temp_video_path = output_path.split('.')[0] + 'temp.mp4' + out = cv2.VideoWriter(temp_video_path, fourcc, fps, (width, height)) # Use original FPS + + # Annotate video frames with bounding boxes + for i in range(num_frames): + frame = video_array[i] + if i in frame_indices: + idx = frame_indices.index(i) + x1, y1, x2, y2 = xy_bbox[idx] + # Draw bounding box + thickness = max(int((x2 - x1) / 40), 2) + cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), thickness) + # Put the number in the top-left corner of the bounding box + cv2.putText(frame, numbers, (int(x1) + 10, int(y1) + 35), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 255), 3) + + # Write frame to temporary video + out.write(frame) + + out.release() + cap.release() # Release the video capture object + + # Load original video and audio + original_video = mp.VideoFileClip(video_path) + annotated_video = mp.VideoFileClip(temp_video_path) + + # Combine annotated video with original audio, ensuring alignment + final_video = annotated_video.set_audio(original_video.audio) + + # Write the final output video with audio + final_video.write_videofile(output_path, codec='libx264', audio_codec='aac', fps=fps) + + # Clean up temporary video file + annotated_video.close() + original_video.close() + + # Optionally, remove the temporary video file + import os + if os.path.exists(temp_video_path): + os.remove(temp_video_path) + + return output_path + + +def annotate_video_with_bounding_boxes(video_array, frame_indices, bounding_boxes, output_path): + """ + Annotates specified frames in the video with bounding boxes and saves the result to a new video file. + + :param video_array: Input video as a numpy array with shape (num_frames, height, width, channels). + :param frame_indices: List of frame indices to annotate. + :param bounding_boxes: Array of bounding box coordinates with shape (num_frames_to_annotate, 4), where each bounding box is (x, y, w, h). + :param output_path: Path to save the output video. + """ + # Get video dimensions + num_frames, height, width, channels = video_array.shape + assert channels == 3, "Input video must have 3 channels (BGR)." 
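+ # Note: unlike the two audio-preserving helpers above, this variant writes the annotated video at a fixed 30 fps and does not re-attach the original audio.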
+ + # Initialize video writer + fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for mp4 + out = cv2.VideoWriter(output_path, fourcc, 30.0, (width, height)) + + # option 1: keep all video frames + for i in range(num_frames): + frame = video_array[i] + if i in frame_indices: + idx = frame_indices.index(i) + x1, y1, x2, y2 = bounding_boxes[idx] + # Draw bounding box + thickness = max(int((x2-x1)/40), 2) + cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), thickness) + + # Write frame to output video + out.write(frame) + + # option 2: crop to the annotated frames only + # for in_id, out_id in enumerate(frame_indices): + # frame = video_array[out_id] + # x1, y1, x2, y2 = bounding_boxes[in_id] + # # Draw bounding box + # cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 5) + # # Write frame to output video + # out.write(frame) + + out.release() + return output_path + + +def crop_from_array(frame_before_crop, coords): + x1, y1, x2, y2 = coords + cropped_frame = frame_before_crop[y1:y2, x1:x2] + return cropped_frame \ No newline at end of file diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py index 242ad3afe5..5b12ef0dd8 100644 --- a/data_juicer/utils/constant.py +++ b/data_juicer/utils/constant.py @@ -27,6 +27,16 @@ class Fields(object): multimodal_data_output_dir = DEFAULT_PREFIX + 'produced_data__' + active_speaker_flag = DEFAULT_PREFIX + 'active_speaker_flag_' + audio_speech_attribute = DEFAULT_PREFIX + 'audio_speech_attribute_' + speech_ASR = DEFAULT_PREFIX + 'speech_ASR_' + speech_emotion = DEFAULT_PREFIX + 'speech_emotion_' + video_facetrack_attribute_emotion = DEFAULT_PREFIX + 'video_facetrack_attribute_emotion_' + track_video_caption = DEFAULT_PREFIX + 'track_video_caption_' + video_track_is_child = DEFAULT_PREFIX + 'video_track_is_child_' + human_track_data_path = DEFAULT_PREFIX + 'human_track_data_path_' + min_people_in_video = DEFAULT_PREFIX + 'min_people_in_video_' + class BatchMetaKeys(object): entity_attribute = 'entity_attribute' most_relavant_entities = 'most_relavant_entities' @@ -250,6 +260,10 @@ class StatsKeysConstant(object): # video-text video_frames_text_similarity = 'video_frames_text_similarity' + # video-face-ratio + video_face_exist = 'video_face_exist' + + class StatsKeys(object, metaclass=StatsKeysMeta): _constants_class = StatsKeysConstant diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index dd99032e36..6e6c0da703 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -751,6 +751,85 @@ def prepare_vllm_model(pretrained_model_name_or_path, **model_params): return (model, tokenizer) +def prepare_SenseVoiceSmall_model(pretrained_model_name_or_path): + """ + Prepare and load the SenseVoiceSmall ASR / speech-emotion model. + + :param pretrained_model_name_or_path: path or name of the pretrained model. + """ + from data_juicer.my_pretrained_method.SenseVoice.model import SenseVoiceSmall + + logger.info('Loading ASR_Emo_model model...') + ASR_Emo_model, kwargs1 = SenseVoiceSmall.from_pretrained(model=pretrained_model_name_or_path) + + ASR_Emo_model.eval() + return ASR_Emo_model, kwargs1 + +def prepare_light_asd_model( + pretrained_model_name_or_path='weight/finetuning_TalkSet.model'): + """ + Prepare and load the Light-ASD active speaker detection model. + + :param pretrained_model_name_or_path: path to the pretrained Light-ASD weights.
+ """ + logger.info('Loading light_asd model...') + from ASD import ASD + model = ASD() + model.loadParameters(pretrained_model_name_or_path) + model.eval() + return model + +from transformers import AutoModelForCausalLM, AutoTokenizer + +sys.path.append('./data_juicer/my_pretrained_method/VideoLLaMA2') +from videollama2 import model_init +def prepare_VideoLLaMA2_model(pretrained_model_name_or_path): + model, processor, tokenizer = model_init(pretrained_model_name_or_path, device_map="cpu") + model.eval() + return model, processor, tokenizer + +def prepare_sharegpt4video_model(pretrained_model_name_or_path): + """ + Prepare and load light sharegpt4video. + + :param model_name: input model name. + """ + import sys + sys.path.append('./data_juicer/my_pretrained_method/ShareGPT4Video') + from llava.mm_utils import get_model_name_from_path + from llava.model.builder import load_pretrained_model + + logger.info('Loading sharegpt4video model...') + model_name = get_model_name_from_path(pretrained_model_name_or_path) + tokenizer, model, processor, context_len = load_pretrained_model( + pretrained_model_name_or_path, None, model_name, device_map='cpu') + + model.eval() + return tokenizer, model, processor + +def prepare_YOLOv8_human_model( + pretrained_model_name_or_path='/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/YOLOv8_human/weights/best.pt'): + """ + Prepare and load light YOLOv8_human. + + :param model_name: input model name. + """ + logger.info('Loading YOLOv8_human model...') + human_detection_model = torch.load(pretrained_model_name_or_path)['model'].float() + human_detection_model.half() + human_detection_model.eval() + return human_detection_model + +def prepare_face_detect_S3FD_model(): + """ + Prepare and load light asd model. + + :param model_name: input model name. 
+ """ + logger.info('Loading face_detect_S3FD_model model...') + from model.faceDetector.s3fd import S3FD + model = S3FD() + def update_sampling_params(sampling_params, pretrained_model_name_or_path, @@ -817,6 +896,12 @@ def update_sampling_params(sampling_params, 'spacy': prepare_spacy_model, 'video_blip': prepare_video_blip_model, 'vllm': prepare_vllm_model, + 'Light_ASD': prepare_light_asd_model, + 'SenseVoiceSmall': prepare_SenseVoiceSmall_model, + 'VideoLLaMA2': prepare_VideoLLaMA2_model, + 'sharegpt4video': prepare_sharegpt4video_model, + 'YOLOv8_human': prepare_YOLOv8_human_model, + 'face_detect_S3FD': prepare_face_detect_S3FD_model } _MODELS_WITHOUT_FILE_LOCK = { diff --git a/my_pretrained_method/Audiomodel.py b/my_pretrained_method/Audiomodel.py new file mode 100644 index 0000000000..a1c779e112 --- /dev/null +++ b/my_pretrained_method/Audiomodel.py @@ -0,0 +1,17 @@ +import sys +sys.path.append('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/SenseVoice') +from model import SenseVoiceSmall + +model_dir = "/mnt1/daoyuan_mm/SenseVoiceSmall" +m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir) + + +res = m.inference( + data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", + language="zh", # "zn", "en", "yue", "ja", "ko", "nospeech" + use_itn=False, + **kwargs, +) + +print(res) + diff --git a/my_pretrained_method/Light-ASD/ASD.py b/my_pretrained_method/Light-ASD/ASD.py new file mode 100644 index 0000000000..4706069b3b --- /dev/null +++ b/my_pretrained_method/Light-ASD/ASD.py @@ -0,0 +1,102 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm +from subprocess import PIPE + +from loss import lossAV, lossV +from model.Model import ASD_Model +import os +os.chdir('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/Light-ASD') + +class ASD(nn.Module): + def __init__(self, lr = 0.001, lrDecay = 0.95, **kwargs): + super(ASD, self).__init__() + self.model = ASD_Model().cuda() + self.lossAV = lossAV().cuda() + self.lossV = lossV().cuda() + self.optim = torch.optim.Adam(self.parameters(), lr = lr) + self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size = 1, gamma=lrDecay) + print(time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f"%(sum(param.numel() for param in self.model.parameters()) / 1000 / 1000)) + + def train_network(self, loader, epoch, **kwargs): + self.train() + self.scheduler.step(epoch - 1) # StepLR + index, top1, lossV, lossAV, loss = 0, 0, 0, 0, 0 + lr = self.optim.param_groups[0]['lr'] + r = 1.3 - 0.02 * (epoch - 1) + for num, (audioFeature, visualFeature, labels) in enumerate(loader, start=1): + self.zero_grad() + + audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + + outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + + labels = labels[0].reshape((-1)).cuda() # Loss + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels, r) + nlossV = self.lossV.forward(outsV, labels, r) + nloss = nlossAV + 0.5 * nlossV + + lossV += nlossV.detach().cpu().numpy() + lossAV += nlossAV.detach().cpu().numpy() + loss += nloss.detach().cpu().numpy() + top1 += prec + nloss.backward() + self.optim.step() + index += len(labels) + sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \ + " [%2d] r: %2f, Lr: %5f, Training: %.2f%%, " %(epoch, r, lr, 100 
* (num / loader.__len__())) + \ + " LossV: %.5f, LossAV: %.5f, Loss: %.5f, ACC: %2.2f%% \r" %(lossV/(num), lossAV/(num), loss/(num), 100 * (top1/index))) + sys.stderr.flush() + + sys.stdout.write("\n") + + return loss/num, lr + + def evaluate_network(self, loader, evalCsvSave, evalOrig, **kwargs): + self.eval() + predScores = [] + for audioFeature, visualFeature, labels in tqdm.tqdm(loader): + with torch.no_grad(): + audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels[0].reshape((-1)).cuda() + _, predScore, _, _ = self.lossAV.forward(outsAV, labels) + predScore = predScore[:,1].detach().cpu().numpy() + predScores.extend(predScore) + # break + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series( ['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1,inplace=True) + evalRes.drop(['instance_id'], axis=1,inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s "%(evalOrig, evalCsvSave) + mAP = float(str(subprocess.run(cmd, shell=True, stdout=PIPE, stderr=PIPE).stdout).split(' ')[2][:5]) + return mAP + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path) + for name, param in loadedState.items(): + origName = name; + if name not in selfState: + name = name.replace("module.", "") + if name not in selfState: + print("%s is not in the model."%origName) + continue + if selfState[name].size() != loadedState[origName].size(): + sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s"%(origName, selfState[name].size(), loadedState[origName].size())) + continue + selfState[name].copy_(param) \ No newline at end of file diff --git a/my_pretrained_method/Light-ASD/Columbia_test.py b/my_pretrained_method/Light-ASD/Columbia_test.py new file mode 100644 index 0000000000..2c71b9794b --- /dev/null +++ b/my_pretrained_method/Light-ASD/Columbia_test.py @@ -0,0 +1,456 @@ +import sys, time, os, tqdm, torch, argparse, glob, subprocess, warnings, cv2, pickle, numpy, pdb, math, python_speech_features + +from scipy import signal +from shutil import rmtree +from scipy.io import wavfile +from scipy.interpolate import interp1d +from sklearn.metrics import accuracy_score, f1_score + +from scenedetect.video_manager import VideoManager +from scenedetect.scene_manager import SceneManager +from scenedetect.frame_timecode import FrameTimecode +from scenedetect.stats_manager import StatsManager +from scenedetect.detectors import ContentDetector + +from model.faceDetector.s3fd import S3FD +from ASD import ASD + +import os +os.chdir('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/Light-ASD') + +warnings.filterwarnings("ignore") + +parser = argparse.ArgumentParser(description = "Columbia ASD Evaluation") + +parser.add_argument('--videoName', type=str, default="0001", help='Demo video name') +parser.add_argument('--videoFolder', type=str, default="/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/Light-ASD/demo", help='Path for inputs, tmps and outputs') +parser.add_argument('--pretrainModel', type=str, 
default="weight/pretrain_AVA_CVPR.model", help='Path for the pretrained model') + +parser.add_argument('--nDataLoaderThread', type=int, default=10, help='Number of workers') +parser.add_argument('--facedetScale', type=float, default=0.25, help='Scale factor for face detection, the frames will be scale to 0.25 orig') +parser.add_argument('--minTrack', type=int, default=10, help='Number of min frames for each shot') +parser.add_argument('--numFailedDet', type=int, default=10, help='Number of missed detections allowed before tracking is stopped') +parser.add_argument('--minFaceSize', type=int, default=1, help='Minimum face size in pixels') +parser.add_argument('--cropScale', type=float, default=0.40, help='Scale bounding box') + +parser.add_argument('--start', type=int, default=0, help='The start time of the video') +parser.add_argument('--duration', type=int, default=0, help='The duration of the video, when set as 0, will extract the whole video') + +parser.add_argument('--evalCol', dest='evalCol', action='store_true', help='Evaluate on Columbia dataset') +parser.add_argument('--colSavePath', type=str, default="/colDataPath", help='Path for inputs, tmps and outputs') + +args = parser.parse_args() + + +if args.evalCol == True: + # The process is: 1. download video and labels(I have modified the format of labels to make it easiler for using) + # 2. extract audio, extract video frames + # 3. scend detection, face detection and face tracking + # 4. active speaker detection for the detected face clips + # 5. use iou to find the identity of each face clips, compute the F1 results + # The step 1 to 3 will take some time (That is one-time process). It depends on your cpu and gpu speed. For reference, I used 1.5 hour + # The step 4 and 5 need less than 10 minutes + # Need about 20G space finally + # ``` + args.videoName = 'col' + args.videoFolder = args.colSavePath + args.savePath = os.path.join(args.videoFolder, args.videoName) + args.videoPath = os.path.join(args.videoFolder, args.videoName + '.mp4') + args.duration = 0 + if os.path.isfile(args.videoPath) == False: # Download video + link = 'https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s' + cmd = "youtube-dl -f best -o %s '%s'"%(args.videoPath, link) + output = subprocess.call(cmd, shell=True, stdout=None) + if os.path.isdir(args.videoFolder + '/col_labels') == False: # Download label + link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv" + cmd = "gdown --id %s -O %s"%(link, args.videoFolder + '/col_labels.tar.gz') + subprocess.call(cmd, shell=True, stdout=None) + cmd = "tar -xzvf %s -C %s"%(args.videoFolder + '/col_labels.tar.gz', args.videoFolder) + subprocess.call(cmd, shell=True, stdout=None) + os.remove(args.videoFolder + '/col_labels.tar.gz') +else: + args.videoPath = glob.glob(os.path.join(args.videoFolder, args.videoName + '.*'))[0] + args.savePath = os.path.join(args.videoFolder, args.videoName) + +def scene_detect(args): + # CPU: Scene detection, output is the list of each shot's time duration + videoManager = VideoManager([args.videoFilePath]) + statsManager = StatsManager() + sceneManager = SceneManager(statsManager) + sceneManager.add_detector(ContentDetector()) + baseTimecode = videoManager.get_base_timecode() + videoManager.set_downscale_factor() + videoManager.start() + sceneManager.detect_scenes(frame_source = videoManager) + sceneList = sceneManager.get_scene_list(baseTimecode) + savePath = os.path.join(args.pyworkPath, 'scene.pckl') + if sceneList == []: + sceneList = 
[(videoManager.get_base_timecode(),videoManager.get_current_timecode())] + with open(savePath, 'wb') as fil: + pickle.dump(sceneList, fil) + sys.stderr.write('%s - scenes detected %d\n'%(args.videoFilePath, len(sceneList))) + return sceneList + +def inference_video(args): + # GPU: Face detection, output is the list contains the face location and score in this frame + DET = S3FD(device='cuda') + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) + flist.sort() + dets = [] + for fidx, fname in enumerate(flist): + image = cv2.imread(fname) + imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[args.facedetScale]) + dets.append([]) + for bbox in bboxes: + dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) # dets has the frames info, bbox info, conf info + sys.stderr.write('%s-%05d; %d dets\r' % (args.videoFilePath, fidx, len(dets[-1]))) + savePath = os.path.join(args.pyworkPath,'faces.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(dets, fil) + return dets + +def bb_intersection_over_union(boxA, boxB, evalCol = False): + # CPU: IOU Function to calculate overlap between two image + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + interArea = max(0, xB - xA) * max(0, yB - yA) + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) + boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) + if evalCol == True: + iou = interArea / float(boxAArea) + else: + iou = interArea / float(boxAArea + boxBArea - interArea) + return iou + +def track_shot(args, sceneFaces): + # CPU: Face tracking + iouThres = 0.5 # Minimum IOU between consecutive face detections + tracks = [] + while True: + track = [] + for frameFaces in sceneFaces: + for face in frameFaces: + if track == []: + track.append(face) + frameFaces.remove(face) + elif face['frame'] - track[-1]['frame'] <= args.numFailedDet: + iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox']) + if iou > iouThres: + track.append(face) + frameFaces.remove(face) + continue + else: + break + if track == []: + break + elif len(track) > args.minTrack: + frameNum = numpy.array([ f['frame'] for f in track ]) + bboxes = numpy.array([numpy.array(f['bbox']) for f in track]) + frameI = numpy.arange(frameNum[0],frameNum[-1]+1) + bboxesI = [] + for ij in range(0,4): + interpfn = interp1d(frameNum, bboxes[:,ij]) + bboxesI.append(interpfn(frameI)) + bboxesI = numpy.stack(bboxesI, axis=1) + if max(numpy.mean(bboxesI[:,2]-bboxesI[:,0]), numpy.mean(bboxesI[:,3]-bboxesI[:,1])) > args.minFaceSize: + tracks.append({'frame':frameI,'bbox':bboxesI}) + return tracks + +def crop_video(args, track, cropFile): + # CPU: crop the face clips + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) # Read the frames + flist.sort() + vOut = cv2.VideoWriter(cropFile + 't.avi', cv2.VideoWriter_fourcc(*'XVID'), 25, (224,224))# Write video + dets = {'x':[], 'y':[], 's':[]} + for det in track['bbox']: # Read the tracks + dets['s'].append(max((det[3]-det[1]), (det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center x + dets['x'].append((det[0]+det[2])/2) # crop center y + dets['s'] = signal.medfilt(dets['s'], kernel_size=13) # Smooth detections + dets['x'] = signal.medfilt(dets['x'], kernel_size=13) + dets['y'] = signal.medfilt(dets['y'], kernel_size=13) + for fidx, frame in enumerate(track['frame']): + cs = args.cropScale + bs = dets['s'][fidx] # Detection box size + bsi = int(bs * (1 + 2 * cs)) # Pad 
videos by this amount + image = cv2.imread(flist[frame]) + frame = numpy.pad(image, ((bsi,bsi), (bsi,bsi), (0, 0)), 'constant', constant_values=(110, 110)) + my = dets['y'][fidx] + bsi # BBox center Y + mx = dets['x'][fidx] + bsi # BBox center X + face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] + vOut.write(cv2.resize(face, (224, 224))) + audioTmp = cropFile + '.wav' + audioStart = (track['frame'][0]) / 25 + audioEnd = (track['frame'][-1]+1) / 25 + vOut.release() + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic" % \ + (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)) + output = subprocess.call(command, shell=True, stdout=None) # Crop audio file + _, audio = wavfile.read(audioTmp) + command = ("ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic" % \ + (cropFile, audioTmp, args.nDataLoaderThread, cropFile)) # Combine audio and video file + output = subprocess.call(command, shell=True, stdout=None) + os.remove(cropFile + 't.avi') + return {'track':track, 'proc_track':dets} + +def extract_MFCC(file, outPath): + # CPU: extract mfcc + sr, audio = wavfile.read(file) + mfcc = python_speech_features.mfcc(audio,sr) # (N_frames, 13) [1s = 100 frames] + featuresPath = os.path.join(outPath, file.split('/')[-1].replace('.wav', '.npy')) + numpy.save(featuresPath, mfcc) + +def evaluate_network(files, args): + # GPU: active speaker detection by pretrained model + s = ASD() + s.loadParameters(args.pretrainModel) + sys.stderr.write("Model %s loaded from previous state! \r\n"%args.pretrainModel) + s.eval() + allScores = [] + # durationSet = {1,2,4,6} # To make the result more reliable + durationSet = {1,1,1,2,2,2,3,3,4,5,6} # Use this line can get more reliable result + for file in tqdm.tqdm(files, total = len(files)): + fileName = os.path.splitext(file.split('/')[-1])[0] # Load audio and video + _, audio = wavfile.read(os.path.join(args.pycropPath, fileName + '.wav')) + audioFeature = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = 0.025, winstep = 0.010) + video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + '.avi')) + videoFeature = [] + while video.isOpened(): + ret, frames = video.read() + if ret == True: + face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (224,224)) + face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))] + videoFeature.append(face) + else: + break + video.release() + videoFeature = numpy.array(videoFeature) + length = min((audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100, videoFeature.shape[0]) + audioFeature = audioFeature[:int(round(length * 100)),:] + videoFeature = videoFeature[:int(round(length * 25)),:,:] + allScore = [] # Evaluation use model + for duration in durationSet: + batchSize = int(math.ceil(length / duration)) + scores = [] + with torch.no_grad(): + for i in range(batchSize): + inputA = torch.FloatTensor(audioFeature[i * duration * 100:(i+1) * duration * 100,:]).unsqueeze(0).cuda() + inputV = torch.FloatTensor(videoFeature[i * duration * 25: (i+1) * duration * 25,:,:]).unsqueeze(0).cuda() + embedA = s.model.forward_audio_frontend(inputA) + embedV = s.model.forward_visual_frontend(inputV) + out = s.model.forward_audio_visual_backend(embedA, embedV) + score = s.lossAV.forward(out, labels = None) + scores.extend(score) + allScore.append(scores) + allScore = numpy.round((numpy.mean(numpy.array(allScore), axis = 0)), 
1).astype(float) + allScores.append(allScore) + return allScores + +def visualization(tracks, scores, args): + # CPU: visulize the result for video format + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) + flist.sort() + faces = [[] for i in range(len(flist))] + for tidx, track in enumerate(tracks): + score = scores[tidx] + for fidx, frame in enumerate(track['track']['frame'].tolist()): + s = score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)] # average smoothing + s = numpy.mean(s) + faces[frame].append({'track':tidx, 'score':float(s),'s':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) + firstImage = cv2.imread(flist[0]) + fw = firstImage.shape[1] + fh = firstImage.shape[0] + vOut = cv2.VideoWriter(os.path.join(args.pyaviPath, 'video_only.avi'), cv2.VideoWriter_fourcc(*'XVID'), 25, (fw,fh)) + colorDict = {0: 0, 1: 255} + for fidx, fname in tqdm.tqdm(enumerate(flist), total = len(flist)): + image = cv2.imread(fname) + for face in faces[fidx]: + clr = colorDict[int((face['score'] >= 0))] + txt = round(face['score'], 1) + cv2.rectangle(image, (int(face['x']-face['s']), int(face['y']-face['s'])), (int(face['x']+face['s']), int(face['y']+face['s'])),(0,clr,255-clr),10) + cv2.putText(image,'%s'%(txt), (int(face['x']-face['s']), int(face['y']-face['s'])), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,clr,255-clr),5) + vOut.write(image) + vOut.release() + command = ("ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic" % \ + (os.path.join(args.pyaviPath, 'video_only.avi'), os.path.join(args.pyaviPath, 'audio.wav'), \ + args.nDataLoaderThread, os.path.join(args.pyaviPath,'video_out.avi'))) + output = subprocess.call(command, shell=True, stdout=None) + +def evaluate_col_ASD(tracks, scores, args): + txtPath = args.videoFolder + '/col_labels/fusion/*.txt' # Load labels + predictionSet = {} + for name in {'long', 'bell', 'boll', 'lieb', 'sick', 'abbas'}: + predictionSet[name] = [[],[]] + dictGT = {} + txtFiles = glob.glob("%s"%txtPath) + for file in txtFiles: + lines = open(file).read().splitlines() + idName = file.split('/')[-1][:-4] + for line in lines: + data = line.split('\t') + frame = int(int(data[0]) / 29.97 * 25) + x1 = int(data[1]) + y1 = int(data[2]) + x2 = int(data[1]) + int(data[3]) + y2 = int(data[2]) + int(data[3]) + gt = int(data[4]) + if frame in dictGT: + dictGT[frame].append([x1,y1,x2,y2,gt,idName]) + else: + dictGT[frame] = [[x1,y1,x2,y2,gt,idName]] + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) # Load files + flist.sort() + faces = [[] for i in range(len(flist))] + for tidx, track in enumerate(tracks): + score = scores[tidx] + for fidx, frame in enumerate(track['track']['frame'].tolist()): + s = numpy.mean(score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)]) # average smoothing + faces[frame].append({'track':tidx, 'score':float(s),'s':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) + for fidx, fname in tqdm.tqdm(enumerate(flist), total = len(flist)): + if fidx in dictGT: # This frame has label + for gtThisFrame in dictGT[fidx]: # What this label is ? 
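+ # Each ground-truth entry built above is [x1, y1, x2, y2, label, speaker_id].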
+ faceGT = gtThisFrame[0:4] + labelGT = gtThisFrame[4] + idGT = gtThisFrame[5] + ious = [] + for face in faces[fidx]: # Find the right face in my result + faceLocation = [int(face['x']-face['s']), int(face['y']-face['s']), int(face['x']+face['s']), int(face['y']+face['s'])] + faceLocation_new = [int(face['x']-face['s']) // 2, int(face['y']-face['s']) // 2, int(face['x']+face['s']) // 2, int(face['y']+face['s']) // 2] + iou = bb_intersection_over_union(faceLocation_new, faceGT, evalCol = True) + if iou > 0.5: + ious.append([iou, round(face['score'],2)]) + if len(ious) > 0: # Find my result + ious.sort() + labelPredict = ious[-1][1] + else: + labelPredict = 0 + x1 = faceGT[0] + y1 = faceGT[1] + width = faceGT[2] - faceGT[0] + predictionSet[idGT][0].append(labelPredict) + predictionSet[idGT][1].append(labelGT) + names = ['long', 'bell', 'boll', 'lieb', 'sick', 'abbas'] # Evaluate + names.sort() + F1s = 0 + for i in names: + scores = numpy.array(predictionSet[i][0]) + labels = numpy.array(predictionSet[i][1]) + scores = numpy.int64(scores > 0) + F1 = f1_score(labels, scores) + ACC = accuracy_score(labels, scores) + if i != 'abbas': + F1s += F1 + print("%s, ACC:%.2f, F1:%.2f"%(i, 100 * ACC, 100 * F1)) + print("Average F1:%.2f"%(100 * (F1s / 5))) + +# Main function +def main(): + # This preprocesstion is modified based on this [repository](https://github.com/joonson/syncnet_python). + # ``` + # . + # ├── pyavi + # │   ├── audio.wav (Audio from input video) + # │   ├── video.avi (Copy of the input video) + # │   ├── video_only.avi (Output video without audio) + # │   └── video_out.avi (Output video with audio) + # ├── pycrop (The detected face videos and audios) + # │ ├── 000000.avi + # │ ├── 000000.wav + # │ ├── 000001.avi + # │ ├── 000001.wav + # │ └── ... + # ├── pyframes (All the video frames in this video) + # │ ├── 000001.jpg + # │ ├── 000002.jpg + # │ └── ... 
+ # └── pywork + # ├── faces.pckl (face detection result) + # ├── scene.pckl (scene detection result) + # ├── scores.pckl (ASD result) + # └── tracks.pckl (face tracking result) + # ``` + + # Initialization + args.pyaviPath = os.path.join(args.savePath, 'pyavi') + args.pyframesPath = os.path.join(args.savePath, 'pyframes') + args.pyworkPath = os.path.join(args.savePath, 'pywork') + args.pycropPath = os.path.join(args.savePath, 'pycrop') + if os.path.exists(args.savePath): + rmtree(args.savePath) + os.makedirs(args.pyaviPath, exist_ok = True) # The path for the input video, input audio, output video + os.makedirs(args.pyframesPath, exist_ok = True) # Save all the video frames + os.makedirs(args.pyworkPath, exist_ok = True) # Save the results in this process by the pckl method + os.makedirs(args.pycropPath, exist_ok = True) # Save the detected face clips (audio+video) in this process + + # Extract video + args.videoFilePath = os.path.join(args.pyaviPath, 'video.avi') + # If duration did not set, extract the whole video, otherwise extract the video from 'args.start' to 'args.start + args.duration' + if args.duration == 0: + command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic" % \ + (args.videoPath, args.nDataLoaderThread, args.videoFilePath)) + else: + command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic" % \ + (args.videoPath, args.nDataLoaderThread, args.start, args.start + args.duration, args.videoFilePath)) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the video and save in %s \r\n" %(args.videoFilePath)) + + # Extract audio + args.audioFilePath = os.path.join(args.pyaviPath, 'audio.wav') + command = ("ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic" % \ + (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the audio and save in %s \r\n" %(args.audioFilePath)) + + # Extract the video frames + command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % \ + (args.videoFilePath, args.nDataLoaderThread, os.path.join(args.pyframesPath, '%06d.jpg'))) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the frames and save in %s \r\n" %(args.pyframesPath)) + + # Scene detection for the video frames + scene = scene_detect(args) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Scene detection and save in %s \r\n" %(args.pyworkPath)) + + # Face detection for the video frames + faces = inference_video(args) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face detection and save in %s \r\n" %(args.pyworkPath)) + + # Face tracking + allTracks, vidTracks = [], [] + for shot in scene: + if shot[1].frame_num - shot[0].frame_num >= args.minTrack: # Discard the shot frames less than minTrack frames + allTracks.extend(track_shot(args, faces[shot[0].frame_num:shot[1].frame_num])) # 'frames' to present this tracks' timestep, 'bbox' presents the location of the faces + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face track and detected %d tracks \r\n" %len(allTracks)) + + # Face clips cropping + for ii, track in tqdm.tqdm(enumerate(allTracks), total = len(allTracks)): + vidTracks.append(crop_video(args, track, os.path.join(args.pycropPath, '%05d'%ii))) + savePath = 
os.path.join(args.pyworkPath, 'tracks.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(vidTracks, fil) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face Crop and saved in %s tracks \r\n" %args.pycropPath) + fil = open(savePath, 'rb') + vidTracks = pickle.load(fil) + + # Active Speaker Detection + files = glob.glob("%s/*.avi"%args.pycropPath) + files.sort() + scores = evaluate_network(files, args) + savePath = os.path.join(args.pyworkPath, 'scores.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(scores, fil) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Scores extracted and saved in %s \r\n" %args.pyworkPath) + + if args.evalCol == True: + evaluate_col_ASD(vidTracks, scores, args) # The columnbia video is too big for visualization. You can still add the `visualization` funcition here if you want + quit() + else: + # Visualization, save the result as the new video + visualization(vidTracks, scores, args) + +if __name__ == '__main__': + main() diff --git a/my_pretrained_method/Light-ASD/LICENSE b/my_pretrained_method/Light-ASD/LICENSE new file mode 100644 index 0000000000..b755b4bbed --- /dev/null +++ b/my_pretrained_method/Light-ASD/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Liao Junhua + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/my_pretrained_method/Light-ASD/README.md b/my_pretrained_method/Light-ASD/README.md new file mode 100644 index 0000000000..57d1c5b0af --- /dev/null +++ b/my_pretrained_method/Light-ASD/README.md @@ -0,0 +1,89 @@ +## A Light Weight Model for Active Speaker Detection +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-light-weight-model-for-active-speaker/audio-visual-active-speaker-detection-on-ava)](https://paperswithcode.com/sota/audio-visual-active-speaker-detection-on-ava?p=a-light-weight-model-for-active-speaker) + +This repository contains the code and model weights for our [paper](https://openaccess.thecvf.com/content/CVPR2023/papers/Liao_A_Light_Weight_Model_for_Active_Speaker_Detection_CVPR_2023_paper.pdf) (CVPR 2023): + +> A Light Weight Model for Active Speaker Detection +> Junhua Liao, Haihan Duan, Kanghui Feng, Wanbing Zhao, Yanbing Yang, Liangyin Chen + + +*** +### Evaluate on AVA-ActiveSpeaker dataset + +#### Data preparation +Use the following code to download and preprocess the AVA dataset. +``` +python train.py --dataPathAVA AVADataPath --download +``` +The AVA dataset and the labels will be downloaded into `AVADataPath`. 
+ +#### Training +You can train the model on the AVA dataset by using: +``` +python train.py --dataPathAVA AVADataPath +``` +`exps/exps1/score.txt`: output score file, `exps/exp1/model/model_00xx.model`: trained model, `exps/exps1/val_res.csv`: prediction for val set. + +#### Testing +Our model weights have been placed in the `weight` folder. It performs `mAP: 94.06%` in the validation set. You can check it by using: +``` +python train.py --dataPathAVA AVADataPath --evaluation +``` + + +*** +### Evaluate on Columbia ASD dataset + +#### Testing +The model weights trained on the AVA dataset have been placed in the `weight` folder. Then run the following code. +``` +python Columbia_test.py --evalCol --colSavePath colDataPath +``` +The Columbia ASD dataset and the labels will be downloaded into `colDataPath`. And you can get the following F1 result. +| Name | Bell | Boll | Lieb | Long | Sick | Avg. | +|----- | ------ | ------ | ------ | ------ | ------ | ------ | +| F1 | 82.7% | 75.7% | 87.0% | 74.5% | 85.4% | 81.1% | + +We have also provided the model weights fine-tuned on the TalkSet dataset. Due to space limitations, we did not exhibit it in the paper. Run the following code. +``` +python Columbia_test.py --evalCol --pretrainModel weight/finetuning_TalkSet.model --colSavePath colDataPath +``` +And you can get the following F1 result. +| Name | Bell | Boll | Lieb | Long | Sick | Avg. | +|----- | ------ | ------ | ------ | ------ | ------ | ------ | +| F1 | 97.7% | 86.3% | 98.2% | 99.0% | 96.3% | 95.5% | + + +*** +### An ASD Demo with pretrained Light-ASD model +You can put the raw video (`.mp4` and `.avi` are both fine) into the `demo` folder, such as `0001.mp4`. +``` +python Columbia_test.py --videoName 0001 --videoFolder demo +``` +By default, the model loads weights trained on the AVA-ActiveSpeaker dataset. If you want to load weights fine-tuned on TalkSet, you can execute the following code. +``` +python Columbia_test.py --videoName 0001 --videoFolder demo --pretrainModel weight/finetuning_TalkSet.model +``` +You can obtain the output video `demo/0001/pyavi/video_out.avi`, where the active speaker is marked by a green box and the non-active speaker by a red box. + + +*** +### Citation + +Please cite our paper if you use this code or model weights. + +``` +@InProceedings{Liao_2023_CVPR, + author = {Liao, Junhua and Duan, Haihan and Feng, Kanghui and Zhao, Wanbing and Yang, Yanbing and Chen, Liangyin}, + title = {A Light Weight Model for Active Speaker Detection}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2023}, + pages = {22932-22941} +} +``` + +*** +### Acknowledgments +Thanks for the support of TaoRuijie's open source [repository](https://github.com/TaoRuijie/TalkNet-ASD) for this research. 
+ diff --git a/my_pretrained_method/Light-ASD/bbox.png b/my_pretrained_method/Light-ASD/bbox.png new file mode 100644 index 0000000000..e047acfe48 Binary files /dev/null and b/my_pretrained_method/Light-ASD/bbox.png differ diff --git a/my_pretrained_method/Light-ASD/bbox1.png b/my_pretrained_method/Light-ASD/bbox1.png new file mode 100644 index 0000000000..d788ad32d9 Binary files /dev/null and b/my_pretrained_method/Light-ASD/bbox1.png differ diff --git a/my_pretrained_method/Light-ASD/bbox2.png b/my_pretrained_method/Light-ASD/bbox2.png new file mode 100644 index 0000000000..90101e6fef Binary files /dev/null and b/my_pretrained_method/Light-ASD/bbox2.png differ diff --git a/my_pretrained_method/Light-ASD/dataLoader.py b/my_pretrained_method/Light-ASD/dataLoader.py new file mode 100644 index 0000000000..dde2b95b03 --- /dev/null +++ b/my_pretrained_method/Light-ASD/dataLoader.py @@ -0,0 +1,143 @@ +import os, torch, numpy, cv2, random, glob, python_speech_features +from scipy.io import wavfile +from torchvision.transforms import RandomCrop + +def generate_audio_set(dataPath, batchList): + audioSet = {} + for line in batchList: + data = line.split('\t') + videoName = data[0][:11] + dataName = data[0] + _, audio = wavfile.read(os.path.join(dataPath, videoName, dataName + '.wav')) + audioSet[dataName] = audio + return audioSet + +def overlap(dataName, audio, audioSet): + noiseName = random.sample(set(list(audioSet.keys())) - {dataName}, 1)[0] + noiseAudio = audioSet[noiseName] + snr = [random.uniform(-5, 5)] + if len(noiseAudio) < len(audio): + shortage = len(audio) - len(noiseAudio) + noiseAudio = numpy.pad(noiseAudio, (0, shortage), 'wrap') + else: + noiseAudio = noiseAudio[:len(audio)] + noiseDB = 10 * numpy.log10(numpy.mean(abs(noiseAudio ** 2)) + 1e-4) + cleanDB = 10 * numpy.log10(numpy.mean(abs(audio ** 2)) + 1e-4) + noiseAudio = numpy.sqrt(10 ** ((cleanDB - noiseDB - snr) / 10)) * noiseAudio + audio = audio + noiseAudio + return audio.astype(numpy.int16) + +def load_audio(data, dataPath, numFrames, audioAug, audioSet = None): + dataName = data[0] + fps = float(data[2]) + audio = audioSet[dataName] + if audioAug == True: + augType = random.randint(0,1) + if augType == 1: + audio = overlap(dataName, audio, audioSet) + else: + audio = audio + # fps is not always 25, in order to align the visual, we modify the window and step in MFCC extraction process based on fps + audio = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = 0.025 * 25 / fps, winstep = 0.010 * 25 / fps) + maxAudio = int(numFrames * 4) + if audio.shape[0] < maxAudio: + shortage = maxAudio - audio.shape[0] + audio = numpy.pad(audio, ((0, shortage), (0,0)), 'wrap') + audio = audio[:int(round(numFrames * 4)),:] + return audio + +def load_visual(data, dataPath, numFrames, visualAug): + dataName = data[0] + videoName = data[0][:11] + faceFolderPath = os.path.join(dataPath, videoName, dataName) + faceFiles = glob.glob("%s/*.jpg"%faceFolderPath) + sortedFaceFiles = sorted(faceFiles, key=lambda data: (float(data.split('/')[-1][:-4])), reverse=False) + faces = [] + H = 112 + if visualAug == True: + new = int(H*random.uniform(0.7, 1)) + x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new) + M = cv2.getRotationMatrix2D((H/2,H/2), random.uniform(-15, 15), 1) + augType = random.choice(['orig', 'flip', 'crop', 'rotate']) + else: + augType = 'orig' + for faceFile in sortedFaceFiles[:numFrames]: + face = cv2.imread(faceFile) + face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (H,H)) + 
if augType == 'orig': + faces.append(face) + elif augType == 'flip': + faces.append(cv2.flip(face, 1)) + elif augType == 'crop': + faces.append(cv2.resize(face[y:y+new, x:x+new] , (H,H))) + elif augType == 'rotate': + faces.append(cv2.warpAffine(face, M, (H,H))) + faces = numpy.array(faces) + return faces + + +def load_label(data, numFrames): + res = [] + labels = data[3].replace('[', '').replace(']', '') + labels = labels.split(',') + for label in labels: + res.append(int(label)) + res = numpy.array(res[:numFrames]) + return res + +class train_loader(object): + def __init__(self, trialFileName, audioPath, visualPath, batchSize, **kwargs): + self.audioPath = audioPath + self.visualPath = visualPath + self.miniBatch = [] + mixLst = open(trialFileName).read().splitlines() + # sort the training set by the length of the videos, shuffle them to make more videos in the same batch belong to different movies + sortedMixLst = sorted(mixLst, key=lambda data: (int(data.split('\t')[1]), int(data.split('\t')[-1])), reverse=True) + start = 0 + while True: + length = int(sortedMixLst[start].split('\t')[1]) + end = min(len(sortedMixLst), start + max(int(batchSize / length), 1)) + self.miniBatch.append(sortedMixLst[start:end]) + if end == len(sortedMixLst): + break + start = end + + def __getitem__(self, index): + batchList = self.miniBatch[index] + numFrames = int(batchList[-1].split('\t')[1]) + audioFeatures, visualFeatures, labels = [], [], [] + audioSet = generate_audio_set(self.audioPath, batchList) # load the audios in this batch to do augmentation + for line in batchList: + data = line.split('\t') + audioFeatures.append(load_audio(data, self.audioPath, numFrames, audioAug = True, audioSet = audioSet)) + visualFeatures.append(load_visual(data, self.visualPath,numFrames, visualAug = True)) + labels.append(load_label(data, numFrames)) + return torch.FloatTensor(numpy.array(audioFeatures)), \ + torch.FloatTensor(numpy.array(visualFeatures)), \ + torch.LongTensor(numpy.array(labels)) + + def __len__(self): + return len(self.miniBatch) + + +class val_loader(object): + def __init__(self, trialFileName, audioPath, visualPath, **kwargs): + self.audioPath = audioPath + self.visualPath = visualPath + self.miniBatch = open(trialFileName).read().splitlines() + + def __getitem__(self, index): + line = [self.miniBatch[index]] + numFrames = int(line[0].split('\t')[1]) + audioSet = generate_audio_set(self.audioPath, line) + data = line[0].split('\t') + audioFeatures = [load_audio(data, self.audioPath, numFrames, audioAug = False, audioSet = audioSet)] + visualFeatures = [load_visual(data, self.visualPath,numFrames, visualAug = False)] + labels = [load_label(data, numFrames)] + return torch.FloatTensor(numpy.array(audioFeatures)), \ + torch.FloatTensor(numpy.array(visualFeatures)), \ + torch.LongTensor(numpy.array(labels)) + + def __len__(self): + return len(self.miniBatch) \ No newline at end of file diff --git a/my_pretrained_method/Light-ASD/demo/0001.mp4 b/my_pretrained_method/Light-ASD/demo/0001.mp4 new file mode 100644 index 0000000000..1c7a37754e Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001.mp4 differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyavi/audio.wav b/my_pretrained_method/Light-ASD/demo/0001/pyavi/audio.wav new file mode 100644 index 0000000000..71210a91fd Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyavi/audio.wav differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyavi/video.avi 
b/my_pretrained_method/Light-ASD/demo/0001/pyavi/video.avi new file mode 100644 index 0000000000..bb6da52246 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyavi/video.avi differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pycrop/00000t.avi b/my_pretrained_method/Light-ASD/demo/0001/pycrop/00000t.avi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000001.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000001.jpg new file mode 100644 index 0000000000..89584ec60d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000001.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000002.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000002.jpg new file mode 100644 index 0000000000..dea3855f23 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000002.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000003.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000003.jpg new file mode 100644 index 0000000000..dea3855f23 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000003.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000004.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000004.jpg new file mode 100644 index 0000000000..b61338b990 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000004.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000005.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000005.jpg new file mode 100644 index 0000000000..8202475b7d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000005.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000006.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000006.jpg new file mode 100644 index 0000000000..4ed4094ef7 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000006.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000007.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000007.jpg new file mode 100644 index 0000000000..e3baaf2b0b Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000007.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000008.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000008.jpg new file mode 100644 index 0000000000..5adf0925ed Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000008.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000009.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000009.jpg new file mode 100644 index 0000000000..e7cafa504d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000009.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000010.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000010.jpg new file mode 100644 index 0000000000..9f336c5aa3 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000010.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000011.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000011.jpg new file mode 100644 index 0000000000..588703e469 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000011.jpg differ diff --git 
a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000012.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000012.jpg new file mode 100644 index 0000000000..00f79bdcf5 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000012.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000013.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000013.jpg new file mode 100644 index 0000000000..e78bba2fbf Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000013.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000014.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000014.jpg new file mode 100644 index 0000000000..dbbda97827 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000014.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000015.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000015.jpg new file mode 100644 index 0000000000..1c0d55ad27 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000015.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000016.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000016.jpg new file mode 100644 index 0000000000..f32ded4a10 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000016.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000017.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000017.jpg new file mode 100644 index 0000000000..47f3fac68c Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000017.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000018.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000018.jpg new file mode 100644 index 0000000000..c0e855940d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000018.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000019.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000019.jpg new file mode 100644 index 0000000000..4273a6374e Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000019.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000020.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000020.jpg new file mode 100644 index 0000000000..53af093169 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000020.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000021.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000021.jpg new file mode 100644 index 0000000000..11c2e91fa7 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000021.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000022.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000022.jpg new file mode 100644 index 0000000000..94e937cbe6 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000022.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000023.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000023.jpg new file mode 100644 index 0000000000..cbc00f164f Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000023.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000024.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000024.jpg new file mode 100644 
index 0000000000..f83c3cf431 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000024.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000025.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000025.jpg new file mode 100644 index 0000000000..7e5988a249 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000025.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000026.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000026.jpg new file mode 100644 index 0000000000..660f2971e2 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000026.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000027.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000027.jpg new file mode 100644 index 0000000000..e63322e3d7 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000027.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000028.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000028.jpg new file mode 100644 index 0000000000..5508aa63ac Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000028.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000029.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000029.jpg new file mode 100644 index 0000000000..7f825a553a Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000029.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000030.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000030.jpg new file mode 100644 index 0000000000..6b7273ec6b Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000030.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000031.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000031.jpg new file mode 100644 index 0000000000..ddd0ca6550 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000031.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000032.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000032.jpg new file mode 100644 index 0000000000..c2f5f9c903 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000032.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000033.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000033.jpg new file mode 100644 index 0000000000..5144c8a567 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000033.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000034.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000034.jpg new file mode 100644 index 0000000000..c17bbc1658 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000034.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000035.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000035.jpg new file mode 100644 index 0000000000..bff7fedb9c Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000035.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000036.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000036.jpg new file mode 100644 index 0000000000..bcd431b067 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000036.jpg differ diff --git 
a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000037.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000037.jpg new file mode 100644 index 0000000000..8906e352c6 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000037.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000038.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000038.jpg new file mode 100644 index 0000000000..eb54665ea4 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000038.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000039.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000039.jpg new file mode 100644 index 0000000000..c498e2559b Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000039.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000040.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000040.jpg new file mode 100644 index 0000000000..cc809fca7d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000040.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000041.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000041.jpg new file mode 100644 index 0000000000..1a2f49412e Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000041.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000042.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000042.jpg new file mode 100644 index 0000000000..771e486002 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000042.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000043.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000043.jpg new file mode 100644 index 0000000000..76ab423025 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000043.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000044.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000044.jpg new file mode 100644 index 0000000000..f5803f695c Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000044.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000045.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000045.jpg new file mode 100644 index 0000000000..44235ce6a3 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000045.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000046.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000046.jpg new file mode 100644 index 0000000000..187c6a3067 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000046.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000047.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000047.jpg new file mode 100644 index 0000000000..010dfbc750 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000047.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000048.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000048.jpg new file mode 100644 index 0000000000..180d3bcf76 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000048.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000049.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000049.jpg new file mode 100644 
index 0000000000..61a7991b14 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000049.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000050.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000050.jpg new file mode 100644 index 0000000000..8df7018248 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000050.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000051.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000051.jpg new file mode 100644 index 0000000000..ec4536f14e Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000051.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000052.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000052.jpg new file mode 100644 index 0000000000..a456445a4a Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000052.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000053.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000053.jpg new file mode 100644 index 0000000000..b73fc4d020 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000053.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000054.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000054.jpg new file mode 100644 index 0000000000..62c50252fb Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000054.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000055.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000055.jpg new file mode 100644 index 0000000000..de026bf9ff Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000055.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000056.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000056.jpg new file mode 100644 index 0000000000..0bb373e2de Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000056.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000057.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000057.jpg new file mode 100644 index 0000000000..83ca0f377d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000057.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000058.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000058.jpg new file mode 100644 index 0000000000..6008bec057 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000058.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000059.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000059.jpg new file mode 100644 index 0000000000..e9338640f6 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000059.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000060.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000060.jpg new file mode 100644 index 0000000000..f9ae45b7ff Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000060.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000061.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000061.jpg new file mode 100644 index 0000000000..ee8bca90a1 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000061.jpg differ diff --git 
a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000062.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000062.jpg new file mode 100644 index 0000000000..b3d7422f2e Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000062.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000063.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000063.jpg new file mode 100644 index 0000000000..8d40bc20f4 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000063.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000064.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000064.jpg new file mode 100644 index 0000000000..426d939316 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000064.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000065.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000065.jpg new file mode 100644 index 0000000000..5793d50e12 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000065.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000066.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000066.jpg new file mode 100644 index 0000000000..d2d1f63417 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000066.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000067.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000067.jpg new file mode 100644 index 0000000000..7649606f6b Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000067.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000068.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000068.jpg new file mode 100644 index 0000000000..ba5dfaecd9 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000068.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000069.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000069.jpg new file mode 100644 index 0000000000..f8fe0d7a9d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000069.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000070.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000070.jpg new file mode 100644 index 0000000000..33632d6041 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000070.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000071.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000071.jpg new file mode 100644 index 0000000000..d223c0fc10 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000071.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000072.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000072.jpg new file mode 100644 index 0000000000..181227ca02 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000072.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000073.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000073.jpg new file mode 100644 index 0000000000..c371b1ba64 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000073.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000074.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000074.jpg new file mode 100644 
index 0000000000..bc0e933962 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000074.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000075.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000075.jpg new file mode 100644 index 0000000000..466b98d0e4 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000075.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000076.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000076.jpg new file mode 100644 index 0000000000..822e70af2f Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000076.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000077.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000077.jpg new file mode 100644 index 0000000000..24e3e6dc61 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000077.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000078.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000078.jpg new file mode 100644 index 0000000000..709508716f Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000078.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000079.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000079.jpg new file mode 100644 index 0000000000..915ea8512d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000079.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000080.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000080.jpg new file mode 100644 index 0000000000..fc6f0c3c77 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000080.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000081.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000081.jpg new file mode 100644 index 0000000000..3fe2064690 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000081.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000082.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000082.jpg new file mode 100644 index 0000000000..c9bf62d1ff Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000082.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000083.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000083.jpg new file mode 100644 index 0000000000..cfd53bb9d1 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000083.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000084.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000084.jpg new file mode 100644 index 0000000000..f09900164b Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000084.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000085.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000085.jpg new file mode 100644 index 0000000000..3cbb9b01e5 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000085.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000086.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000086.jpg new file mode 100644 index 0000000000..2e9c09dd30 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000086.jpg differ diff --git 
a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000087.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000087.jpg new file mode 100644 index 0000000000..b776ed8c93 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000087.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000088.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000088.jpg new file mode 100644 index 0000000000..0c873b0965 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000088.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000089.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000089.jpg new file mode 100644 index 0000000000..baa23fb49d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000089.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000090.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000090.jpg new file mode 100644 index 0000000000..40d4f5595b Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000090.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000091.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000091.jpg new file mode 100644 index 0000000000..3dee868ce3 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000091.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000092.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000092.jpg new file mode 100644 index 0000000000..9e62405ed5 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000092.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000093.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000093.jpg new file mode 100644 index 0000000000..ed5edffe69 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000093.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000094.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000094.jpg new file mode 100644 index 0000000000..e9fbf633bf Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000094.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000095.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000095.jpg new file mode 100644 index 0000000000..82eaa32e52 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000095.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000096.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000096.jpg new file mode 100644 index 0000000000..0fe29a696c Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000096.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000097.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000097.jpg new file mode 100644 index 0000000000..e7dba85bc2 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000097.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000098.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000098.jpg new file mode 100644 index 0000000000..baabf4b98d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000098.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000099.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000099.jpg new file mode 100644 
index 0000000000..b8e2b99ee4 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000099.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000100.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000100.jpg new file mode 100644 index 0000000000..e98efca8cc Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000100.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000101.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000101.jpg new file mode 100644 index 0000000000..6a8a97d641 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000101.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000102.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000102.jpg new file mode 100644 index 0000000000..6ac1677e00 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000102.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000103.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000103.jpg new file mode 100644 index 0000000000..a487b5699e Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000103.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000104.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000104.jpg new file mode 100644 index 0000000000..68b829c927 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000104.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000105.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000105.jpg new file mode 100644 index 0000000000..c7e0b77ade Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000105.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000106.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000106.jpg new file mode 100644 index 0000000000..0369497a03 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000106.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000107.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000107.jpg new file mode 100644 index 0000000000..65ac398a5b Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000107.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000108.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000108.jpg new file mode 100644 index 0000000000..7366492d1d Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000108.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pyframes/000109.jpg b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000109.jpg new file mode 100644 index 0000000000..eaea186f8b Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pyframes/000109.jpg differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pywork/faces.pckl b/my_pretrained_method/Light-ASD/demo/0001/pywork/faces.pckl new file mode 100644 index 0000000000..e2db6ccbd0 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pywork/faces.pckl differ diff --git a/my_pretrained_method/Light-ASD/demo/0001/pywork/scene.pckl b/my_pretrained_method/Light-ASD/demo/0001/pywork/scene.pckl new file mode 100644 index 0000000000..657004aab3 Binary files /dev/null and b/my_pretrained_method/Light-ASD/demo/0001/pywork/scene.pckl differ diff --git 
a/my_pretrained_method/Light-ASD/loss.py b/my_pretrained_method/Light-ASD/loss.py
new file mode 100644
index 0000000000..a7ffcc66ef
--- /dev/null
+++ b/my_pretrained_method/Light-ASD/loss.py
@@ -0,0 +1,43 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class lossAV(nn.Module):
+    def __init__(self):
+        super(lossAV, self).__init__()
+        self.criterion = nn.BCELoss()
+        self.FC = nn.Linear(128, 2)
+
+    def forward(self, x, labels = None, r = 1):
+        x = x.squeeze(1)
+        x = self.FC(x)
+        if labels == None:
+            predScore = x[:,1]
+            predScore = predScore.t()
+            predScore = predScore.view(-1).detach().cpu().numpy()
+            return predScore
+        else:
+            x1 = x / r
+            x1 = F.softmax(x1, dim = -1)[:,1]
+            nloss = self.criterion(x1, labels.float())
+            predScore = F.softmax(x, dim = -1)
+            predLabel = torch.round(F.softmax(x, dim = -1))[:,1]
+            correctNum = (predLabel == labels).sum().float()
+            return nloss, predScore, predLabel, correctNum
+
+
+class lossV(nn.Module):
+    def __init__(self):
+        super(lossV, self).__init__()
+        self.criterion = nn.BCELoss()
+        self.FC = nn.Linear(128, 2)
+
+    def forward(self, x, labels, r = 1):
+        x = x.squeeze(1)
+        x = self.FC(x)
+
+        x = x / r
+        x = F.softmax(x, dim = -1)
+
+        nloss = self.criterion(x[:,1], labels.float())
+        return nloss
\ No newline at end of file
diff --git a/my_pretrained_method/Light-ASD/model/Classifier.py b/my_pretrained_method/Light-ASD/model/Classifier.py
new file mode 100644
index 0000000000..73c0c52fe9
--- /dev/null
+++ b/my_pretrained_method/Light-ASD/model/Classifier.py
@@ -0,0 +1,31 @@
+import torch
+from torch import nn
+
+
+class BGRU(nn.Module):
+    def __init__(self, channel):
+        super(BGRU, self).__init__()
+
+        self.gru_forward = nn.GRU(input_size = channel, hidden_size = channel, num_layers = 1, bidirectional = False, bias = True, batch_first = True)
+        self.gru_backward = nn.GRU(input_size = channel, hidden_size = channel, num_layers = 1, bidirectional = False, bias = True, batch_first = True)
+
+        self.gelu = nn.GELU()
+        self.__init_weight()
+
+    def forward(self, x):
+        x, _ = self.gru_forward(x)
+        x = self.gelu(x)
+        x = torch.flip(x, dims=[1])
+        x, _ = self.gru_backward(x)
+        x = torch.flip(x, dims=[1])
+        x = self.gelu(x)
+
+        return x
+
+    def __init_weight(self):
+        for m in self.modules():
+            if isinstance(m, nn.GRU):
+                torch.nn.init.kaiming_normal_(m.weight_ih_l0)
+                torch.nn.init.kaiming_normal_(m.weight_hh_l0)
+                m.bias_ih_l0.data.zero_()
+                m.bias_hh_l0.data.zero_()
\ No newline at end of file
diff --git a/my_pretrained_method/Light-ASD/model/Encoder.py b/my_pretrained_method/Light-ASD/model/Encoder.py
new file mode 100644
index 0000000000..7815a65577
--- /dev/null
+++ b/my_pretrained_method/Light-ASD/model/Encoder.py
@@ -0,0 +1,164 @@
+import torch
+import torch.nn as nn
+
+
+class Audio_Block(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(Audio_Block, self).__init__()
+
+        self.relu = nn.ReLU()
+
+        self.m_3 = nn.Conv2d(in_channels, out_channels, kernel_size = (3, 1), padding = (1, 0), bias = False)
+        self.bn_m_3 = nn.BatchNorm2d(out_channels, momentum = 0.01, eps = 0.001)
+        self.t_3 = nn.Conv2d(out_channels, out_channels, kernel_size = (1, 3), padding = (0, 1), bias = False)
+        self.bn_t_3 = nn.BatchNorm2d(out_channels, momentum = 0.01, eps = 0.001)
+
+        self.m_5 = nn.Conv2d(in_channels, out_channels, kernel_size = (5, 1), padding = (2, 0), bias = False)
+        self.bn_m_5 = nn.BatchNorm2d(out_channels, momentum = 0.01, eps = 0.001)
+        self.t_5 = nn.Conv2d(out_channels, out_channels, kernel_size = (1, 5), padding = (0, 2), bias = False)
+        self.bn_t_5 = nn.BatchNorm2d(out_channels, momentum = 0.01, eps = 0.001)
+
+        self.last = nn.Conv2d(out_channels, out_channels, kernel_size = (1, 1), padding = (0, 0), bias = False)
+        self.bn_last = nn.BatchNorm2d(out_channels, momentum = 0.01, eps = 0.001)
+
+    def forward(self, x):
+
+        x_3 = self.relu(self.bn_m_3(self.m_3(x)))
+        x_3 = self.relu(self.bn_t_3(self.t_3(x_3)))
+
+        x_5 = self.relu(self.bn_m_5(self.m_5(x)))
+        x_5 = self.relu(self.bn_t_5(self.t_5(x_5)))
+
+        x = x_3 + x_5
+        x = self.relu(self.bn_last(self.last(x)))
+
+        return x
+
+
+class Visual_Block(nn.Module):
+    def __init__(self, in_channels, out_channels, is_down = False):
+        super(Visual_Block, self).__init__()
+
+        self.relu = nn.ReLU()
+
+        if is_down:
+            self.s_3 = nn.Conv3d(in_channels, out_channels, kernel_size = (1, 3, 3), stride = (1, 2, 2), padding = (0, 1, 1), bias = False)
+            self.bn_s_3 = nn.BatchNorm3d(out_channels, momentum = 0.01, eps = 0.001)
+            self.t_3 = nn.Conv3d(out_channels, out_channels, kernel_size = (3, 1, 1), padding = (1, 0, 0), bias = False)
+            self.bn_t_3 = nn.BatchNorm3d(out_channels, momentum = 0.01, eps = 0.001)
+
+            self.s_5 = nn.Conv3d(in_channels, out_channels, kernel_size = (1, 5, 5), stride = (1, 2, 2), padding = (0, 2, 2), bias = False)
+            self.bn_s_5 = nn.BatchNorm3d(out_channels, momentum = 0.01, eps = 0.001)
+            self.t_5 = nn.Conv3d(out_channels, out_channels, kernel_size = (5, 1, 1), padding = (2, 0, 0), bias = False)
+            self.bn_t_5 = nn.BatchNorm3d(out_channels, momentum = 0.01, eps = 0.001)
+        else:
+            self.s_3 = nn.Conv3d(in_channels, out_channels, kernel_size = (1, 3, 3), padding = (0, 1, 1), bias = False)
+            self.bn_s_3 = nn.BatchNorm3d(out_channels, momentum = 0.01, eps = 0.001)
+            self.t_3 = nn.Conv3d(out_channels, out_channels, kernel_size = (3, 1, 1), padding = (1, 0, 0), bias = False)
+            self.bn_t_3 = nn.BatchNorm3d(out_channels, momentum = 0.01, eps = 0.001)
+
+            self.s_5 = nn.Conv3d(in_channels, out_channels, kernel_size = (1, 5, 5), padding = (0, 2, 2), bias = False)
+            self.bn_s_5 = nn.BatchNorm3d(out_channels, momentum = 0.01, eps = 0.001)
+            self.t_5 = nn.Conv3d(out_channels, out_channels, kernel_size = (5, 1, 1), padding = (2, 0, 0), bias = False)
+            self.bn_t_5 = nn.BatchNorm3d(out_channels, momentum = 0.01, eps = 0.001)
+
+        self.last = nn.Conv3d(out_channels, out_channels, kernel_size = (1, 1, 1), padding = (0, 0, 0), bias = False)
+        self.bn_last = nn.BatchNorm3d(out_channels, momentum = 0.01, eps = 0.001)
+
+    def forward(self, x):
+
+        x_3 = self.relu(self.bn_s_3(self.s_3(x)))
+        x_3 = self.relu(self.bn_t_3(self.t_3(x_3)))
+
+        x_5 = self.relu(self.bn_s_5(self.s_5(x)))
+        x_5 = self.relu(self.bn_t_5(self.t_5(x_5)))
+
+        x = x_3 + x_5
+
+        x = self.relu(self.bn_last(self.last(x)))
+
+        return x
+
+
+class visual_encoder(nn.Module):
+    def __init__(self):
+        super(visual_encoder, self).__init__()
+
+        self.block1 = Visual_Block(1, 32, is_down = True)
+        self.pool1 = nn.MaxPool3d(kernel_size = (1, 3, 3), stride = (1, 2, 2), padding = (0, 1, 1))
+
+        self.block2 = Visual_Block(32, 64)
+        self.pool2 = nn.MaxPool3d(kernel_size = (1, 3, 3), stride = (1, 2, 2), padding = (0, 1, 1))
+
+        self.block3 = Visual_Block(64, 128)
+
+        self.maxpool = nn.AdaptiveMaxPool2d((1, 1))
+
+        self.__init_weight()
+
+    def forward(self, x):
+
+        x = self.block1(x)
+        x = self.pool1(x)
+
+        x = self.block2(x)
+        x = self.pool2(x)
+
+        x = self.block3(x)
+
+        x = x.transpose(1,2)
+        B, T, C, W, H = x.shape
+        x = x.reshape(B*T, C, W, H)
+
+        x = self.maxpool(x)
+
+        x = x.view(B, T, C)
+
+        return x
+
+    def __init_weight(self):
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv3d):
+                torch.nn.init.kaiming_normal_(m.weight)
+            elif isinstance(m, nn.BatchNorm3d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+
+class audio_encoder(nn.Module):
+    def __init__(self):
+        super(audio_encoder, self).__init__()
+
+        self.block1 = Audio_Block(1, 32)
+        self.pool1 = nn.MaxPool3d(kernel_size = (1, 1, 3), stride = (1, 1, 2), padding = (0, 0, 1))
+
+        self.block2 = Audio_Block(32, 64)
+        self.pool2 = nn.MaxPool3d(kernel_size = (1, 1, 3), stride = (1, 1, 2), padding = (0, 0, 1))
+
+        self.block3 = Audio_Block(64, 128)
+
+        self.__init_weight()
+
+    def forward(self, x):
+
+        x = self.block1(x)
+        x = self.pool1(x)
+
+        x = self.block2(x)
+        x = self.pool2(x)
+
+        x = self.block3(x)
+
+        x = torch.mean(x, dim = 2, keepdim = True)
+        x = x.squeeze(2).transpose(1, 2)
+
+        return x
+
+    def __init_weight(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                torch.nn.init.kaiming_normal_(m.weight)
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
\ No newline at end of file
diff --git a/my_pretrained_method/Light-ASD/model/Model.py b/my_pretrained_method/Light-ASD/model/Model.py
new file mode 100644
index 0000000000..1f3ab065c2
--- /dev/null
+++ b/my_pretrained_method/Light-ASD/model/Model.py
@@ -0,0 +1,43 @@
+import torch
+import torch.nn as nn
+
+from model.Classifier import BGRU
+from model.Encoder import visual_encoder, audio_encoder
+
+class ASD_Model(nn.Module):
+    def __init__(self):
+        super(ASD_Model, self).__init__()
+
+        self.visualEncoder = visual_encoder()
+        self.audioEncoder = audio_encoder()
+        self.GRU = BGRU(128)
+
+    def forward_visual_frontend(self, x):
+        B, T, W, H = x.shape
+        x = x.view(B, 1, T, W, H)
+        x = (x / 255 - 0.4161) / 0.1688
+        x = self.visualEncoder(x)
+        return x
+
+    def forward_audio_frontend(self, x):
+        x = x.unsqueeze(1).transpose(2, 3)
+        x = self.audioEncoder(x)
+        return x
+
+    def forward_audio_visual_backend(self, x1, x2):
+        x = x1 + x2
+        x = self.GRU(x)
+        x = torch.reshape(x, (-1, 128))
+        return x
+
+    def forward_visual_backend(self,x):
+        x = torch.reshape(x, (-1, 128))
+        return x
+
+    def forward(self, audioFeature, visualFeature):
+        audioEmbed = self.forward_audio_frontend(audioFeature)
+        visualEmbed = self.forward_visual_frontend(visualFeature)
+        outsAV = self.forward_audio_visual_backend(audioEmbed, visualEmbed)
+        outsV = self.forward_visual_backend(visualEmbed)
+
+        return outsAV, outsV
\ No newline at end of file
diff --git a/my_pretrained_method/Light-ASD/model/faceDetector/README.md b/my_pretrained_method/Light-ASD/model/faceDetector/README.md
new file mode 100644
index 0000000000..f5a8d4feb0
--- /dev/null
+++ b/my_pretrained_method/Light-ASD/model/faceDetector/README.md
@@ -0,0 +1,3 @@
+# Face detector
+
+This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.
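Editor's note: the five files above (loss.py, Classifier.py, Encoder.py, Model.py and the face-detector README) are vendored from the upstream Light-ASD repository and are what the video_active_speaker_mapper drives. The snippet below is not part of the patch; it is a minimal, hypothetical smoke test for this model code, under the assumed Light-ASD conventions of 112x112 grayscale face crops at 25 fps and 13-dimensional MFCC audio features at four audio frames per video frame. It is assumed to be run from the Light-ASD directory so that the `model.*` and `loss` imports resolve.

import torch

from loss import lossAV                    # scoring head added in this diff (128-d features -> speaking score)
from model.Model import ASD_Model          # audio-visual backbone added in this diff

model = ASD_Model().eval()
score_head = lossAV().eval()               # with labels=None its forward returns per-frame scores

T = 25                                     # assumed: one second of video at 25 fps
audio = torch.randn(1, 4 * T, 13)          # assumed MFCC shape: (batch, 4*T audio frames, 13 coefficients)
faces = torch.rand(1, T, 112, 112) * 255   # assumed face-crop shape; normalization happens inside the model

with torch.no_grad():
    outs_av, outs_v = model(audio, faces)  # outs_av: (T, 128) fused audio-visual embedding per frame
    scores = score_head(outs_av)           # numpy array of length T, higher means more likely speaking
print(scores.shape)                        # (25,)

The randomly initialized weights above only verify tensor shapes; in the active-speaker mapper the pretrained Light-ASD checkpoint would be loaded before scoring real face tracks.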
diff --git a/my_pretrained_method/Light-ASD/model/faceDetector/__init__.py b/my_pretrained_method/Light-ASD/model/faceDetector/__init__.py new file mode 100644 index 0000000000..059d49bf0b --- /dev/null +++ b/my_pretrained_method/Light-ASD/model/faceDetector/__init__.py @@ -0,0 +1 @@ +from .s3fd import S3FD \ No newline at end of file diff --git a/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/__init__.py b/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/__init__.py new file mode 100644 index 0000000000..1df05cad10 --- /dev/null +++ b/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/__init__.py @@ -0,0 +1,66 @@ +import time, os, sys, subprocess +import numpy as np +import cv2 +import torch +from torchvision import transforms +from .nets import S3FDNet +from .box_utils import nms_ + +PATH_WEIGHT = 'model/faceDetector/s3fd/sfd_face.pth' +if os.path.isfile(PATH_WEIGHT) == False: + Link = "1KafnHz7ccT-3IyddBsL5yi2xGtxAKypt" + cmd = "gdown --id %s -O %s"%(Link, PATH_WEIGHT) + subprocess.call(cmd, shell=True, stdout=None) +img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') + + +class S3FD(): + + def __init__(self, device='cpu'): + + tstamp = time.time() + self.device = device + + # print('[S3FD] loading with', self.device) + self.net = S3FDNet(device=self.device).to(self.device) + PATH = os.path.join('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/Light-ASD', PATH_WEIGHT) + state_dict = torch.load(PATH, map_location=self.device) + self.net.load_state_dict(state_dict) + self.net.eval() + # print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + + def detect_faces(self, image, conf_th=0.8, scales=[1]): + + w, h = image.shape[1], image.shape[0] + + bboxes = np.empty(shape=(0, 5)) + + with torch.no_grad(): + for s in scales: + scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR) + + scaled_img = np.swapaxes(scaled_img, 1, 2) + scaled_img = np.swapaxes(scaled_img, 1, 0) + scaled_img = scaled_img[[2, 1, 0], :, :] + scaled_img = scaled_img.astype('float32') + scaled_img -= img_mean + scaled_img = scaled_img[[2, 1, 0], :, :] + x = torch.from_numpy(scaled_img).unsqueeze(0).to(next(self.net.parameters()).device) + y = self.net(x) + + detections = y.data + scale = torch.Tensor([w, h, w, h]) + + for i in range(detections.size(1)): + j = 0 + while detections[0, i, j, 0] > conf_th: + score = detections[0, i, j, 0] + pt = (detections[0, i, j, 1:] * scale).cpu().numpy() + bbox = (pt[0], pt[1], pt[2], pt[3], score) + bboxes = np.vstack((bboxes, bbox)) + j += 1 + + keep = nms_(bboxes, 0.1) + bboxes = bboxes[keep] + + return bboxes diff --git a/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/box_utils.py b/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/box_utils.py new file mode 100644 index 0000000000..1bf4be2cbc --- /dev/null +++ b/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/box_utils.py @@ -0,0 +1,217 @@ +import numpy as np +from itertools import product as product +import torch +from torch.autograd import Function + + +def nms_(dets, thresh): + """ + Courtesy of Ross Girshick + [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py] + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(int(i)) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = 
np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return np.array(keep).astype(int) + + +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat(( + priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. + """ + + keep = scores.new(scores.size(0)).zero_().long() + if boxes.numel() == 0: + return keep, 0 + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. 
after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w * h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter / union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count + + +class Detect(object): + + def __init__(self, num_classes=2, + top_k=750, nms_thresh=0.3, conf_thresh=0.05, + variance=[0.1, 0.2], nms_top_k=5000): + + self.num_classes = num_classes + self.top_k = top_k + self.nms_thresh = nms_thresh + self.conf_thresh = conf_thresh + self.variance = variance + self.nms_top_k = nms_top_k + + def forward(self, loc_data, conf_data, prior_data): + + num = loc_data.size(0) + num_priors = prior_data.size(0) + + conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1) + batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) + batch_priors = batch_priors.contiguous().view(-1, 4) + + decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance) + decoded_boxes = decoded_boxes.view(num, num_priors, 4) + + output = torch.zeros(num, self.num_classes, self.top_k, 5) + + for i in range(num): + boxes = decoded_boxes[i].clone() + conf_scores = conf_preds[i].clone() + + for cl in range(1, self.num_classes): + c_mask = conf_scores[cl].gt(self.conf_thresh) + scores = conf_scores[cl][c_mask] + + if scores.dim() == 0: + continue + l_mask = c_mask.unsqueeze(1).expand_as(boxes) + boxes_ = boxes[l_mask].view(-1, 4) + ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k) + count = count if count < self.top_k else self.top_k + + output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) + + return output + + +class PriorBox(object): + + def __init__(self, input_size, feature_maps, + variance=[0.1, 0.2], + min_sizes=[16, 32, 64, 128, 256, 512], + steps=[4, 8, 16, 32, 64, 128], + clip=False): + + super(PriorBox, self).__init__() + + self.imh = input_size[0] + self.imw = input_size[1] + self.feature_maps = feature_maps + + self.variance = variance + self.min_sizes = min_sizes + self.steps = steps + self.clip = clip + + def forward(self): + mean = [] + for k, fmap in enumerate(self.feature_maps): + feath = fmap[0] + featw = fmap[1] + for i, j in product(range(feath), range(featw)): + f_kw = self.imw / self.steps[k] + f_kh = self.imh / self.steps[k] + + cx = (j + 0.5) / f_kw + cy = (i + 0.5) / f_kh + + s_kw = self.min_sizes[k] / self.imw + s_kh = self.min_sizes[k] / self.imh + + mean += [cx, cy, s_kw, s_kh] + + output = torch.FloatTensor(mean).view(-1, 4) + + if self.clip: + output.clamp_(max=1, min=0) + + return output diff --git a/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/nets.py b/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/nets.py new file mode 100644 index 0000000000..ca063b6bd3 --- /dev/null +++ b/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/nets.py @@ -0,0 +1,174 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from .box_utils import Detect, PriorBox + + +class L2Norm(nn.Module): + + def __init__(self, n_channels, scale): + super(L2Norm, self).__init__() + self.n_channels = n_channels + self.gamma = scale or None + self.eps = 1e-10 + self.weight = nn.Parameter(torch.Tensor(self.n_channels)) + self.reset_parameters() + + def reset_parameters(self): + init.constant_(self.weight, self.gamma) + + def 
forward(self, x): + norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps + x = torch.div(x, norm) + out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x + return out + + +class S3FDNet(nn.Module): + + def __init__(self, device='cpu'): + super(S3FDNet, self).__init__() + self.device = device + + self.vgg = nn.ModuleList([ + nn.Conv2d(3, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(64, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(64, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(128, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2, ceil_mode=True), + + nn.Conv2d(256, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), + nn.ReLU(inplace=True), + nn.Conv2d(1024, 1024, 1, 1), + nn.ReLU(inplace=True), + ]) + + self.L2Norm3_3 = L2Norm(256, 10) + self.L2Norm4_3 = L2Norm(512, 8) + self.L2Norm5_3 = L2Norm(512, 5) + + self.extras = nn.ModuleList([ + nn.Conv2d(1024, 256, 1, 1), + nn.Conv2d(256, 512, 3, 2, padding=1), + nn.Conv2d(512, 128, 1, 1), + nn.Conv2d(128, 256, 3, 2, padding=1), + ]) + + self.loc = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(1024, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(256, 4, 3, 1, padding=1), + ]) + + self.conf = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(1024, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(256, 2, 3, 1, padding=1), + ]) + + self.softmax = nn.Softmax(dim=-1) + self.detect = Detect() + + def forward(self, x): + size = x.size()[2:] + sources = list() + loc = list() + conf = list() + + for k in range(16): + x = self.vgg[k](x) + s = self.L2Norm3_3(x) + sources.append(s) + + for k in range(16, 23): + x = self.vgg[k](x) + s = self.L2Norm4_3(x) + sources.append(s) + + for k in range(23, 30): + x = self.vgg[k](x) + s = self.L2Norm5_3(x) + sources.append(s) + + for k in range(30, len(self.vgg)): + x = self.vgg[k](x) + sources.append(x) + + # apply extra layers and cache source layer outputs + for k, v in enumerate(self.extras): + x = F.relu(v(x), inplace=True) + if k % 2 == 1: + sources.append(x) + + # apply multibox head to source layers + loc_x = self.loc[0](sources[0]) + conf_x = self.conf[0](sources[0]) + + max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) + conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) + + loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) + conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) + + for i in range(1, len(sources)): + x = sources[i] + conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) + loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) + + features_maps = [] + for i in range(len(loc)): + feat = 
[] + feat += [loc[i].size(1), loc[i].size(2)] + features_maps += [feat] + + loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) + conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) + + with torch.no_grad(): + self.priorbox = PriorBox(size, features_maps) + self.priors = self.priorbox.forward() + + output = self.detect.forward( + loc.view(loc.size(0), -1, 4), + self.softmax(conf.view(conf.size(0), -1, 2)), + self.priors.type(type(x.data)).to(next(self.extras.parameters()).device) + ) + + return output diff --git a/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/sfd_face.pth b/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/sfd_face.pth new file mode 100644 index 0000000000..2bdf053a99 Binary files /dev/null and b/my_pretrained_method/Light-ASD/model/faceDetector/s3fd/sfd_face.pth differ diff --git a/my_pretrained_method/Light-ASD/test.png b/my_pretrained_method/Light-ASD/test.png new file mode 100644 index 0000000000..3c6e89c9f2 Binary files /dev/null and b/my_pretrained_method/Light-ASD/test.png differ diff --git a/my_pretrained_method/Light-ASD/testq.png b/my_pretrained_method/Light-ASD/testq.png new file mode 100644 index 0000000000..623798fc44 Binary files /dev/null and b/my_pretrained_method/Light-ASD/testq.png differ diff --git a/my_pretrained_method/Light-ASD/testqq.png b/my_pretrained_method/Light-ASD/testqq.png new file mode 100644 index 0000000000..40af84d6fe Binary files /dev/null and b/my_pretrained_method/Light-ASD/testqq.png differ diff --git a/my_pretrained_method/Light-ASD/testqqq.png b/my_pretrained_method/Light-ASD/testqqq.png new file mode 100644 index 0000000000..d6472df4e1 Binary files /dev/null and b/my_pretrained_method/Light-ASD/testqqq.png differ diff --git a/my_pretrained_method/Light-ASD/testttt.png b/my_pretrained_method/Light-ASD/testttt.png new file mode 100644 index 0000000000..becd250fe6 Binary files /dev/null and b/my_pretrained_method/Light-ASD/testttt.png differ diff --git a/my_pretrained_method/Light-ASD/train.py b/my_pretrained_method/Light-ASD/train.py new file mode 100644 index 0000000000..ef5ba5d6b4 --- /dev/null +++ b/my_pretrained_method/Light-ASD/train.py @@ -0,0 +1,85 @@ +import time, os, torch, argparse, warnings, glob + +from dataLoader import train_loader, val_loader +from utils.tools import * +from ASD import ASD + +def main(): + # This code is modified based on this [repository](https://github.com/TaoRuijie/TalkNet-ASD). 
+ warnings.filterwarnings("ignore") + + parser = argparse.ArgumentParser(description = "Model Training") + # Training details + parser.add_argument('--lr', type=float, default=0.001, help='Learning rate') + parser.add_argument('--lrDecay', type=float, default=0.95, help='Learning rate decay rate') + parser.add_argument('--maxEpoch', type=int, default=30, help='Maximum number of epochs') + parser.add_argument('--testInterval', type=int, default=1, help='Test and save every [testInterval] epochs') + parser.add_argument('--batchSize', type=int, default=2000, help='Dynamic batch size, default is 2000 frames') + parser.add_argument('--nDataLoaderThread', type=int, default=64, help='Number of loader threads') + # Data path + parser.add_argument('--dataPathAVA', type=str, default="AVADataPath", help='Save path of AVA dataset') + parser.add_argument('--savePath', type=str, default="exps/exp1") + # Data selection + parser.add_argument('--evalDataType', type=str, default="val", help='Only for AVA, to choose the dataset for evaluation, val or test') + # For download dataset only, for evaluation only + parser.add_argument('--downloadAVA', dest='downloadAVA', action='store_true', help='Only download AVA dataset and do related preprocess') + parser.add_argument('--evaluation', dest='evaluation', action='store_true', help='Only do evaluation by using pretrained model [pretrain_AVA_CVPR.model]') + args = parser.parse_args() + # Data loader + args = init_args(args) + + if args.downloadAVA == True: + preprocess_AVA(args) + quit() + + loader = train_loader(trialFileName = args.trainTrialAVA, \ + audioPath = os.path.join(args.audioPathAVA , 'train'), \ + visualPath = os.path.join(args.visualPathAVA, 'train'), \ + **vars(args)) + trainLoader = torch.utils.data.DataLoader(loader, batch_size = 1, shuffle = True, num_workers = args.nDataLoaderThread, pin_memory = True) + + loader = val_loader(trialFileName = args.evalTrialAVA, \ + audioPath = os.path.join(args.audioPathAVA , args.evalDataType), \ + visualPath = os.path.join(args.visualPathAVA, args.evalDataType), \ + **vars(args)) + valLoader = torch.utils.data.DataLoader(loader, batch_size = 1, shuffle = False, num_workers = 64, pin_memory = True) + + if args.evaluation == True: + s = ASD(**vars(args)) + s.loadParameters('weight/pretrain_AVA_CVPR.model') + print("Model %s loaded from previous state!"%('pretrain_AVA_CVPR.model')) + mAP = s.evaluate_network(loader = valLoader, **vars(args)) + print("mAP %2.2f%%"%(mAP)) + quit() + + modelfiles = glob.glob('%s/model_0*.model'%args.modelSavePath) + modelfiles.sort() + if len(modelfiles) >= 1: + print("Model %s loaded from previous state!"%modelfiles[-1]) + epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1 + s = ASD(epoch = epoch, **vars(args)) + s.loadParameters(modelfiles[-1]) + else: + epoch = 1 + s = ASD(epoch = epoch, **vars(args)) + + mAPs = [] + scoreFile = open(args.scoreSavePath, "a+") + + while(1): + loss, lr = s.train_network(epoch = epoch, loader = trainLoader, **vars(args)) + + if epoch % args.testInterval == 0: + s.saveParameters(args.modelSavePath + "/model_%04d.model"%epoch) + mAPs.append(s.evaluate_network(epoch = epoch, loader = valLoader, **vars(args))) + print(time.strftime("%Y-%m-%d %H:%M:%S"), "%d epoch, mAP %2.2f%%, bestmAP %2.2f%%"%(epoch, mAPs[-1], max(mAPs))) + scoreFile.write("%d epoch, LR %f, LOSS %f, mAP %2.2f%%, bestmAP %2.2f%%\n"%(epoch, lr, loss, mAPs[-1], max(mAPs))) + scoreFile.flush() + + if epoch >= args.maxEpoch: + quit() + + epoch += 1 + +if __name__ == 
'__main__': + main() diff --git a/my_pretrained_method/Light-ASD/utils/get_ava_active_speaker_performance.py b/my_pretrained_method/Light-ASD/utils/get_ava_active_speaker_performance.py new file mode 100644 index 0000000000..2e66d1da9b --- /dev/null +++ b/my_pretrained_method/Light-ASD/utils/get_ava_active_speaker_performance.py @@ -0,0 +1,236 @@ +r"""Compute active speaker detection performance for the AVA dataset. +Please send any questions about this code to the Google Group ava-dataset-users: +https://groups.google.com/forum/#!forum/ava-dataset-users +Example usage: +python -O get_ava_active_speaker_performance.py \ +-g testdata/eval.csv \ +-p testdata/predictions.csv \ +-v +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import logging +import time, warnings +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +warnings.filterwarnings("ignore") + +def compute_average_precision(precision, recall): + """Compute Average Precision according to the definition in VOCdevkit. + Precision is modified to ensure that it does not decrease as recall + decrease. + Args: + precision: A float [N, 1] numpy array of precisions + recall: A float [N, 1] numpy array of recalls + Raises: + ValueError: if the input is not of the correct format + Returns: + average_precison: The area under the precision recall curve. NaN if + precision and recall are None. + """ + if precision is None: + if recall is not None: + raise ValueError("If precision is None, recall must also be None") + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance( + recall, np.ndarray): + raise ValueError("precision and recall must be numpy array") + if precision.dtype != np.float or recall.dtype != np.float: + raise ValueError("input must be float numpy array.") + if len(precision) != len(recall): + raise ValueError("precision and recall must be of the same size.") + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError("Precision must be in the range of [0, 1].") + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError("recall must be in the range of [0, 1].") + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError("recall must be a non-decreasing array") + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Smooth precision to be monotonically decreasing. + for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def load_csv(filename, column_names): + """Loads CSV from the filename using given column names. + Adds uid column. + Args: + filename: Path to the CSV file to load. + column_names: A list of column names for the data. + Returns: + df: A Pandas DataFrame containing the data. + """ + # Here and elsewhere, df indicates a DataFrame variable. + + df = pd.read_csv(filename, usecols=column_names) + #df = pd.read_csv(filename, header=None, names=column_names) + + # Creates a unique id from frame timestamp and entity id. 
+ df["uid"] = (df["frame_timestamp"].map(str) + ":" + df["entity_id"]) + return df + + +def eq(a, b, tolerance=1e-09): + """Returns true if values are approximately equal.""" + return abs(a - b) <= tolerance + + +def merge_groundtruth_and_predictions(df_groundtruth, df_predictions): + """Merges groundtruth and prediction DataFrames. + The returned DataFrame is merged on uid field and sorted in descending order + by score field. Bounding boxes are checked to make sure they match between + groundtruth and predictions. + Args: + df_groundtruth: A DataFrame with groundtruth data. + df_predictions: A DataFrame with predictions data. + Returns: + df_merged: A merged DataFrame, with rows matched on uid column. + """ + if df_groundtruth["uid"].count() != df_predictions["uid"].count(): + raise ValueError( + "Groundtruth and predictions CSV must have the same number of " + "unique rows.") + # print(df_predictions["label"].unique()) + if df_predictions["label"].unique() != ["SPEAKING_AUDIBLE"]: + raise ValueError( + "Predictions CSV must contain only SPEAKING_AUDIBLE label.") + + if df_predictions["score"].count() < df_predictions["uid"].count(): + raise ValueError("Predictions CSV must contain score value for every row.") + + # Merges groundtruth and predictions on uid, validates that uid is unique + # in both frames, and sorts the resulting frame by the predictions score. + df_merged = df_groundtruth.merge( + df_predictions, + on="uid", + suffixes=("_groundtruth", "_prediction"), + validate="1:1").sort_values( + by=["score"], ascending=False).reset_index() + # Validates that bounding boxes in ground truth and predictions match for the + # same uids. + df_merged["bounding_box_correct"] = np.where( + eq(df_merged["entity_box_x1_groundtruth"], + df_merged["entity_box_x1_prediction"]) + & eq(df_merged["entity_box_x2_groundtruth"], + df_merged["entity_box_x2_prediction"]) + & eq(df_merged["entity_box_y1_groundtruth"], + df_merged["entity_box_y1_prediction"]) + & eq(df_merged["entity_box_y2_groundtruth"], + df_merged["entity_box_y2_prediction"]), True, False) + + if (~df_merged["bounding_box_correct"]).sum() > 0: + raise ValueError( + "Mismatch between groundtruth and predictions bounding boxes found at " + + str(list(df_merged[~df_merged["bounding_box_correct"]]["uid"]))) + + return df_merged + + +def get_all_positives(df_merged): + """Counts all positive examples in the groundtruth dataset.""" + return df_merged[df_merged["label_groundtruth"] == + "SPEAKING_AUDIBLE"]["uid"].count() + + +def calculate_precision_recall(df_merged): + """Calculates precision and recall arrays going through df_merged row-wise.""" + all_positives = get_all_positives(df_merged) + # Populates each row with 1 if this row is a true positive + # (at its score level). + df_merged["is_tp"] = np.where( + (df_merged["label_groundtruth"] == "SPEAKING_AUDIBLE") & + (df_merged["label_prediction"] == "SPEAKING_AUDIBLE"), 1, 0) + + # Counts true positives up to and including that row. + df_merged["tp"] = df_merged["is_tp"].cumsum() + + # Calculates precision for every row counting true positives up to + # and including that row over the index (1-based) of that row. + df_merged["precision"] = df_merged["tp"] / (df_merged.index + 1) + # Calculates recall for every row counting true positives up to + # and including that row over all positives in the groundtruth dataset. 
+ + df_merged["recall"] = df_merged["tp"] / all_positives + logging.info( + "\n%s\n", + df_merged.head(10)[[ + "uid", "score", "label_groundtruth", "is_tp", "tp", "precision", + "recall" + ]]) + + return np.array(df_merged["precision"]), np.array(df_merged["recall"]) + + +def run_evaluation(groundtruth, predictions): + """Runs AVA Active Speaker evaluation, printing average precision result.""" + df_groundtruth = load_csv( + groundtruth, + column_names=[ + "video_id", "frame_timestamp", "entity_box_x1", "entity_box_y1", + "entity_box_x2", "entity_box_y2", "label", "entity_id" + ]) + df_predictions = load_csv( + predictions, + column_names=[ + "video_id", "frame_timestamp", "entity_box_x1", "entity_box_y1", + "entity_box_x2", "entity_box_y2", "label", "entity_id", "score" + ]) + df_merged = merge_groundtruth_and_predictions(df_groundtruth, df_predictions) + precision, recall = calculate_precision_recall(df_merged) + mAP = 100 * compute_average_precision(precision, recall) + print("average precision: %2.2f%%"%(mAP)) + return mAP + + +def parse_arguments(): + """Parses command-line flags. + Returns: + args: a named tuple containing three file objects args.labelmap, + args.groundtruth, and args.detections. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "-g", + "--groundtruth", + help="CSV file containing ground truth.", + type=argparse.FileType("r"), + required=True) + parser.add_argument( + "-p", + "--predictions", + help="CSV file containing active speaker predictions.", + type=argparse.FileType("r"), + required=True) + parser.add_argument( + "-v", "--verbose", help="Increase output verbosity.", action="store_true") + return parser.parse_args() + + +def main(): + start = time.time() + args = parse_arguments() + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + del args.verbose + mAP = run_evaluation(**vars(args)) + logging.info("Computed in %s seconds", time.time() - start) + return mAP + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/my_pretrained_method/Light-ASD/utils/tools.py b/my_pretrained_method/Light-ASD/utils/tools.py new file mode 100644 index 0000000000..b38c0cb20c --- /dev/null +++ b/my_pretrained_method/Light-ASD/utils/tools.py @@ -0,0 +1,180 @@ +import os, subprocess, glob, pandas, tqdm, cv2, numpy +from scipy.io import wavfile + +def init_args(args): + # The details for the following folders/files can be found in the annotation of the function 'preprocess_AVA' below + args.modelSavePath = os.path.join(args.savePath, 'model') + args.scoreSavePath = os.path.join(args.savePath, 'score.txt') + args.trialPathAVA = os.path.join(args.dataPathAVA, 'csv') + args.audioOrigPathAVA = os.path.join(args.dataPathAVA, 'orig_audios') + args.visualOrigPathAVA= os.path.join(args.dataPathAVA, 'orig_videos') + args.audioPathAVA = os.path.join(args.dataPathAVA, 'clips_audios') + args.visualPathAVA = os.path.join(args.dataPathAVA, 'clips_videos') + args.trainTrialAVA = os.path.join(args.trialPathAVA, 'train_loader.csv') + + if args.evalDataType == 'val': + args.evalTrialAVA = os.path.join(args.trialPathAVA, 'val_loader.csv') + args.evalOrig = os.path.join(args.trialPathAVA, 'val_orig.csv') + args.evalCsvSave = os.path.join(args.savePath, 'val_res.csv') + else: + args.evalTrialAVA = os.path.join(args.trialPathAVA, 'test_loader.csv') + args.evalOrig = os.path.join(args.trialPathAVA, 'test_orig.csv') + args.evalCsvSave = os.path.join(args.savePath, 'test_res.csv') + + os.makedirs(args.modelSavePath, exist_ok = True) + 
os.makedirs(args.dataPathAVA, exist_ok = True) + return args + + +def preprocess_AVA(args): + # This preprocessing is adapted from this [repository](https://github.com/fuankarion/active-speakers-context). + # The required space is 302 G. + # If you do not have enough space, you can delete `orig_videos` (167G) once you have `clips_videos` (85G). + # You can also delete `orig_audios` (44G) once you have `clips_audios` (6.4G). + # So the final space required is less than 100G. + # The AVA dataset will be saved in the 'AVApath' folder in the following format: + # ``` + # ├── clips_audios (The audio clips cut from the original movies) + # │   ├── test + # │   ├── train + # │   └── val + # ├── clips_videos (The face clips cut from the original movies, saved as images, frame-by-frame) + # │   ├── test + # │   ├── train + # │   └── val + # ├── csv + # │   ├── test_file_list.txt (name of the test videos) + # │   ├── test_loader.csv (The csv file we generated to load data for testing) + # │   ├── test_orig.csv (The combination of the given test csv files) + # │   ├── train_loader.csv (The csv file we generated to load data for training) + # │   ├── train_orig.csv (The combination of the given training csv files) + # │   ├── trainval_file_list.txt (name of the train/val videos) + # │   ├── val_loader.csv (The csv file we generated to load data for validation) + # │   └── val_orig.csv (The combination of the given validation csv files) + # ├── orig_audios (The original audios from the movies) + # │   ├── test + # │   └── trainval + # └── orig_videos (The original movies) + # ├── test + # └── trainval + # ``` + + download_csv(args) # Take 1 minute + download_videos(args) # Take 6 hours + extract_audio(args) # Take 1 hour + extract_audio_clips(args) # Take 3 minutes + extract_video_clips(args) # Take about 2 days + +def download_csv(args): + # Take 1 minute to download the required csv files + Link = "1C1cGxPHaJAl1NQ2i7IhRgWmdvsPhBCUy" + cmd = "gdown --id %s -O %s"%(Link, args.dataPathAVA + '/csv.tar.gz') + subprocess.call(cmd, shell=True, stdout=None) + cmd = "tar -xzvf %s -C %s"%(args.dataPathAVA + '/csv.tar.gz', args.dataPathAVA) + subprocess.call(cmd, shell=True, stdout=None) + os.remove(args.dataPathAVA + '/csv.tar.gz') + +def download_videos(args): + # Take 6 hours to download the original movies, follow this repository: https://github.com/cvdfoundation/ava-dataset + for dataType in ['trainval', 'test']: + fileList = open('%s/%s_file_list.txt'%(args.trialPathAVA, dataType)).read().splitlines() + outFolder = '%s/%s'%(args.visualOrigPathAVA, dataType) + for fileName in fileList: + cmd = "wget -P %s https://s3.amazonaws.com/ava-dataset/%s/%s"%(outFolder, dataType, fileName) + subprocess.call(cmd, shell=True, stdout=None) + +def extract_audio(args): + # Take 1 hour to extract the audio from movies + for dataType in ['trainval', 'test']: + inpFolder = '%s/%s'%(args.visualOrigPathAVA, dataType) + outFolder = '%s/%s'%(args.audioOrigPathAVA, dataType) + os.makedirs(outFolder, exist_ok = True) + videos = glob.glob("%s/*"%(inpFolder)) + for videoPath in tqdm.tqdm(videos): + audioPath = '%s/%s'%(outFolder, videoPath.split('/')[-1].split('.')[0] + '.wav') + cmd = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads 8 %s -loglevel panic" % (videoPath, audioPath)) + subprocess.call(cmd, shell=True, stdout=None) + + +def extract_audio_clips(args): + # Take 3 minutes to extract the audio clips + dic = {'train':'trainval', 'val':'trainval', 'test':'test'} + 
for dataType in ['train', 'val', 'test']: + df = pandas.read_csv(os.path.join(args.trialPathAVA, '%s_orig.csv'%(dataType)), engine='python') + dfNeg = pandas.concat([df[df['label_id'] == 0], df[df['label_id'] == 2]]) + dfPos = df[df['label_id'] == 1] + insNeg = dfNeg['instance_id'].unique().tolist() + insPos = dfPos['instance_id'].unique().tolist() + df = pandas.concat([dfPos, dfNeg]).reset_index(drop=True) + df = df.sort_values(['entity_id', 'frame_timestamp']).reset_index(drop=True) + entityList = df['entity_id'].unique().tolist() + df = df.groupby('entity_id') + audioFeatures = {} + outDir = os.path.join(args.audioPathAVA, dataType) + audioDir = os.path.join(args.audioOrigPathAVA, dic[dataType]) + for l in df['video_id'].unique().tolist(): + d = os.path.join(outDir, l[0]) + if not os.path.isdir(d): + os.makedirs(d) + for entity in tqdm.tqdm(entityList, total = len(entityList)): + insData = df.get_group(entity) + videoKey = insData.iloc[0]['video_id'] + start = insData.iloc[0]['frame_timestamp'] + end = insData.iloc[-1]['frame_timestamp'] + entityID = insData.iloc[0]['entity_id'] + insPath = os.path.join(outDir, videoKey, entityID+'.wav') + if videoKey not in audioFeatures.keys(): + audioFile = os.path.join(audioDir, videoKey+'.wav') + sr, audio = wavfile.read(audioFile) + audioFeatures[videoKey] = audio + audioStart = int(float(start)*sr) + audioEnd = int(float(end)*sr) + audioData = audioFeatures[videoKey][audioStart:audioEnd] + wavfile.write(insPath, sr, audioData) + +def extract_video_clips(args): + # Take about 2 days to crop the face clips. + # You can optimize this code to save time, but this process only needs to run once. + # If you do not need the test set, you can process only the train and val parts, which takes about 1 day. + # This process may emit many warnings; you can safely ignore them.
+ dic = {'train':'trainval', 'val':'trainval', 'test':'test'} + for dataType in ['train', 'val', 'test']: + df = pandas.read_csv(os.path.join(args.trialPathAVA, '%s_orig.csv'%(dataType))) + dfNeg = pandas.concat([df[df['label_id'] == 0], df[df['label_id'] == 2]]) + dfPos = df[df['label_id'] == 1] + insNeg = dfNeg['instance_id'].unique().tolist() + insPos = dfPos['instance_id'].unique().tolist() + df = pandas.concat([dfPos, dfNeg]).reset_index(drop=True) + df = df.sort_values(['entity_id', 'frame_timestamp']).reset_index(drop=True) + entityList = df['entity_id'].unique().tolist() + df = df.groupby('entity_id') + outDir = os.path.join(args.visualPathAVA, dataType) + audioDir = os.path.join(args.visualOrigPathAVA, dic[dataType]) + for l in df['video_id'].unique().tolist(): + d = os.path.join(outDir, l[0]) + if not os.path.isdir(d): + os.makedirs(d) + for entity in tqdm.tqdm(entityList, total = len(entityList)): + insData = df.get_group(entity) + videoKey = insData.iloc[0]['video_id'] + entityID = insData.iloc[0]['entity_id'] + videoDir = os.path.join(args.visualOrigPathAVA, dic[dataType]) + videoFile = glob.glob(os.path.join(videoDir, '{}.*'.format(videoKey)))[0] + V = cv2.VideoCapture(videoFile) + insDir = os.path.join(os.path.join(outDir, videoKey, entityID)) + if not os.path.isdir(insDir): + os.makedirs(insDir) + j = 0 + for _, row in insData.iterrows(): + imageFilename = os.path.join(insDir, str("%.2f"%row['frame_timestamp'])+'.jpg') + V.set(cv2.CAP_PROP_POS_MSEC, row['frame_timestamp'] * 1e3) + _, frame = V.read() + h = numpy.size(frame, 0) + w = numpy.size(frame, 1) + x1 = int(row['entity_box_x1'] * w) + y1 = int(row['entity_box_y1'] * h) + x2 = int(row['entity_box_x2'] * w) + y2 = int(row['entity_box_y2'] * h) + face = frame[y1:y2, x1:x2, :] + j = j+1 + cv2.imwrite(imageFilename, face) \ No newline at end of file diff --git a/my_pretrained_method/Light-ASD/weight/finetuning_TalkSet.model b/my_pretrained_method/Light-ASD/weight/finetuning_TalkSet.model new file mode 100644 index 0000000000..d613ee7152 Binary files /dev/null and b/my_pretrained_method/Light-ASD/weight/finetuning_TalkSet.model differ diff --git a/my_pretrained_method/Light-ASD/weight/pretrain_AVA_CVPR.model b/my_pretrained_method/Light-ASD/weight/pretrain_AVA_CVPR.model new file mode 100644 index 0000000000..dbc9703e3a Binary files /dev/null and b/my_pretrained_method/Light-ASD/weight/pretrain_AVA_CVPR.model differ diff --git a/my_pretrained_method/SenseVoice b/my_pretrained_method/SenseVoice new file mode 160000 index 0000000000..771252c097 --- /dev/null +++ b/my_pretrained_method/SenseVoice @@ -0,0 +1 @@ +Subproject commit 771252c0973a139a8634969f52f1d131b62031af diff --git a/my_pretrained_method/ShareGPT4Video b/my_pretrained_method/ShareGPT4Video new file mode 160000 index 0000000000..ba9b51ebfb --- /dev/null +++ b/my_pretrained_method/ShareGPT4Video @@ -0,0 +1 @@ +Subproject commit ba9b51ebfb0fe63412eea829d67e115bb16028ea diff --git a/my_pretrained_method/VideoLLaMA2 b/my_pretrained_method/VideoLLaMA2 new file mode 160000 index 0000000000..fe172b1099 --- /dev/null +++ b/my_pretrained_method/VideoLLaMA2 @@ -0,0 +1 @@ +Subproject commit fe172b1099a245a810ddcfe5497086562ec2850c diff --git a/my_pretrained_method/YOLOv8_human/0zVCgJCEXVY_1_0TEMP_MPY_wvf_snd.mp4 b/my_pretrained_method/YOLOv8_human/0zVCgJCEXVY_1_0TEMP_MPY_wvf_snd.mp4 new file mode 100644 index 0000000000..55c9875265 Binary files /dev/null and b/my_pretrained_method/YOLOv8_human/0zVCgJCEXVY_1_0TEMP_MPY_wvf_snd.mp4 differ diff --git 
a/my_pretrained_method/YOLOv8_human/LICENSE b/my_pretrained_method/YOLOv8_human/LICENSE new file mode 100644 index 0000000000..0ad25db4bd --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/LICENSE @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. 
+ + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. 
+ + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. 
This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. 
Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. 
+Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). 
+ + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". 
+ + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. 
+ + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. + + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. 
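The Light-ASD utilities added above (`init_args` and `preprocess_AVA` in `utils/tools.py`) read their settings from an argparse-style namespace carrying `dataPathAVA`, `savePath`, and `evalDataType`, but this diff does not include the entry point that builds it. Below is a minimal, hypothetical driver sketch: the attribute names mirror what the functions above access, while the script itself, its defaults, and its paths are illustrative assumptions rather than part of the patch.

```python
# Hypothetical driver for the AVA preprocessing utilities above (not part of this patch).
# Attribute names (dataPathAVA, savePath, evalDataType) follow what init_args/preprocess_AVA read.
import argparse

from utils.tools import init_args, preprocess_AVA


def parse_args():
    parser = argparse.ArgumentParser(description='Prepare the AVA dataset for Light-ASD')
    parser.add_argument('--dataPathAVA', type=str, default='./AVA',
                        help='root folder where the AVA data will be stored (assumed path)')
    parser.add_argument('--savePath', type=str, default='./exps/ava',
                        help='folder for models and scores (assumed path)')
    parser.add_argument('--evalDataType', type=str, default='val', choices=['val', 'test'])
    return parser.parse_args()


if __name__ == '__main__':
    args = init_args(parse_args())  # derive csv/clip/audio paths and create the folders
    preprocess_AVA(args)            # download csvs and movies, then extract audio and face clips
```

Note that `preprocess_AVA` downloads the full AVA movies and crops face clips frame by frame, so, per the comments in `tools.py`, a complete run takes roughly two days and about 300 GB of intermediate storage.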
diff --git a/my_pretrained_method/YOLOv8_human/README.md b/my_pretrained_method/YOLOv8_human/README.md new file mode 100644 index 0000000000..404e90cfa7 --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/README.md @@ -0,0 +1,73 @@ +YOLOv8 re-implementation for person detection using PyTorch + +### Installation + +``` +conda create -n YOLO python=3.8 +conda activate YOLO +conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch-lts +pip install opencv-python==4.5.5.64 +pip install PyYAML +pip install tqdm +``` + +### Train + +* Configure your dataset path in `main.py` for training +* Run `bash main.sh $ --train` for training, `$` is number of GPUs + +### Test + +* Configure your dataset path in `main.py` for testing +* Run `python main.py --test` for testing + +### Results + +| Version | COCO weights | CrowdHuman weights | +|:-------:|------------------------------------------------------------------------------------------:|---------------------------:| +| v8_n | [model](https://github.com/jahongir7174/YOLOv8-pt/releases/download/v0.0.1-alpha/v8_n.pt) | [model](./weights/best.pt) | +| v8_s | [model](https://github.com/jahongir7174/YOLOv8-pt/releases/download/v0.0.1-alpha/v8_s.pt) | - | +| v8_m | [model](https://github.com/jahongir7174/YOLOv8-pt/releases/download/v0.0.1-alpha/v8_m.pt) | - | +| v8_l | [model](https://github.com/jahongir7174/YOLOv8-pt/releases/download/v0.0.1-alpha/v8_l.pt) | - | +| v8_x | [model](https://github.com/jahongir7174/YOLOv8-pt/releases/download/v0.0.1-alpha/v8_x.pt) | - | + +* the weights are ported from original repo, see reference + +### Dataset structure + + ├── Person + ├── images + ├── train2017 + ├── 1111.jpg + ├── 2222.jpg + ├── val2017 + ├── 1111.jpg + ├── 2222.jpg + ├── labels + ├── train2017 + ├── 1111.txt + ├── 2222.txt + ├── val2017 + ├── 1111.txt + ├── 2222.txt + +* Public person detection datasets + * [COCO](https://cocodataset.org/#home) + * [CrowdHuman](https://www.crowdhuman.org/download.html) + * [HIEVE](http://humaninevents.org/data.html?title=1) + * [VisDrone](https://github.com/VisDrone/VisDrone-Dataset) + * [VFP290K](https://sites.google.com/view/dash-vfp300k/) + * [Argoverse](https://eval.ai/web/challenges/challenge-page/800/overview) + * [CEPDOF](https://vip.bu.edu/projects/vsns/cossy/datasets/cepdof/) + * [HABBOF](https://vip.bu.edu/projects/vsns/cossy/datasets/habbof/) + * [MW-R](https://vip.bu.edu/projects/vsns/cossy/datasets/mw-r/) + * [TIDOS](https://vip.bu.edu/projects/vsns/cossy/datasets/tidos/) + * [WEPDTOF](https://vip.bu.edu/projects/vsns/cossy/datasets/wepdtof/) + * [DEPOF](https://vip.bu.edu/projects/vsns/cossy/datasets/depof/) + * [FRIDA](https://vip.bu.edu/projects/vsns/cossy/datasets/frida/) + +#### Reference + +* https://github.com/ultralytics/yolov5 +* https://github.com/ultralytics/ultralytics +* https://github.com/open-mmlab/mmyolo diff --git a/my_pretrained_method/YOLOv8_human/dj.py b/my_pretrained_method/YOLOv8_human/dj.py new file mode 100644 index 0000000000..eec983f840 --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/dj.py @@ -0,0 +1,117 @@ +import copy +import csv +import sys +sys.path.append('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/YOLOv8_human/utils') +sys.path.append('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/YOLOv8_human') +import warnings +from argparse import ArgumentParser + +import numpy 
+import torch +import tqdm +import yaml +from torch.utils import data +from nets import nn +from util import non_max_suppression + +warnings.filterwarnings("ignore") + + +@torch.no_grad() +def demo(img_array, model): + import cv2 + + frame = img_array + image = frame.copy() + shape = image.shape[:2] + + r = 640 / max(shape[0], shape[1]) + if r != 1: + resample = cv2.INTER_LINEAR if r > 1 else cv2.INTER_AREA + image = cv2.resize(image, dsize=(int(shape[1] * r), int(shape[0] * r)), interpolation=resample) + height, width = image.shape[:2] + + # Scale ratio (new / old) + r = min(1.0, 640 / height, 640 / width) + + # Compute padding + pad = int(round(width * r)), int(round(height * r)) + w = numpy.mod((640 - pad[0]), 32) / 2 + h = numpy.mod((640 - pad[1]), 32) / 2 + + if (width, height) != pad: # resize + image = cv2.resize(image, pad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(h - 0.1)), int(round(h + 0.1)) + left, right = int(round(w - 0.1)), int(round(w + 0.1)) + image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT) # add border + + # Convert HWC to CHW, BGR to RGB + x = image.transpose((2, 0, 1))[::-1] + x = numpy.ascontiguousarray(x) + x = torch.from_numpy(x) + x = x.unsqueeze(dim=0) + x = x.to(next(model.parameters()).device) + x = x.half() + x = x / 255 + # Inference + outputs = model(x) + # NMS + outputs = non_max_suppression(outputs, 0.25, 0.7) + final_output_box_list = [] + for output in outputs: + output[:, [0, 2]] -= w # x padding + output[:, [1, 3]] -= h # y padding + output[:, :4] /= min(height / shape[0], width / shape[1]) + + output[:, 0].clamp_(0, shape[1]) # x1 + output[:, 1].clamp_(0, shape[0]) # y1 + output[:, 2].clamp_(0, shape[1]) # x2 + output[:, 3].clamp_(0, shape[0]) # y2 + + for box in output: + box = box.cpu().numpy() + x1, y1, x2, y2, score, index = box + final_output_box_list.append((x1, y1, x2, y2)) + # cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2) + del x + return final_output_box_list + + + +def profile(args, params): + model = nn.yolo_v8_n(len(params['names'])) + shape = (1, 3, args.input_size, args.input_size) + + model.eval() + model(torch.zeros(shape)) + params = sum(p.numel() for p in model.parameters()) + if args.local_rank == 0: + print(f'Number of parameters: {int(params)}') + + +def human_detect(img_array): + parser = ArgumentParser() + parser.add_argument('--input-size', default=640, type=int) + parser.add_argument('--local_rank', default=0, type=int) + + args = parser.parse_args() + + args.local_rank = int(os.getenv('LOCAL_RANK', 0)) + args.world_size = int(os.getenv('WORLD_SIZE', 1)) + args.distributed = int(os.getenv('WORLD_SIZE', 1)) > 1 + + if args.distributed: + torch.cuda.set_device(device=args.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + if args.local_rank == 0: + if not os.path.exists('weights'): + os.makedirs('weights') + + profile(args, img_array) + + demo(args,img_array) + + +if __name__ == "__main__": + main() diff --git a/my_pretrained_method/YOLOv8_human/main.py b/my_pretrained_method/YOLOv8_human/main.py new file mode 100644 index 0000000000..97f61947a6 --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/main.py @@ -0,0 +1,403 @@ +import copy +import csv +import os +import warnings +from argparse import ArgumentParser + +import numpy +import torch +import tqdm +import yaml +from torch.utils import data + +from nets import nn +from utils import util +from utils.dataset import Dataset + 
+warnings.filterwarnings("ignore") + + +def learning_rate(args, params): + def fn(x): + return (1 - x / args.epochs) * (1.0 - params['lrf']) + params['lrf'] + + return fn + + +def train(args, params): + # Model + model = nn.yolo_v8_n(len(params['names'])) + model = util.load_weight('./weights/v8_n.pt', model) + model.cuda() + + # Optimizer + accumulate = max(round(64 / (args.batch_size * args.world_size)), 1) + params['weight_decay'] *= args.batch_size * args.world_size * accumulate / 64 + + p = [], [], [] + for v in model.modules(): + if hasattr(v, 'bias') and isinstance(v.bias, torch.nn.Parameter): + p[2].append(v.bias) + if isinstance(v, torch.nn.BatchNorm2d): + p[1].append(v.weight) + elif hasattr(v, 'weight') and isinstance(v.weight, torch.nn.Parameter): + p[0].append(v.weight) + + optimizer = torch.optim.SGD(p[2], params['lr0'], params['momentum'], nesterov=True) + + optimizer.add_param_group({'params': p[0], 'weight_decay': params['weight_decay']}) + optimizer.add_param_group({'params': p[1]}) + del p + + # Scheduler + lr = learning_rate(args, params) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr, last_epoch=-1) + + # EMA + ema = util.EMA(model) if args.local_rank == 0 else None + + filenames = [] + for filename in os.listdir('../Dataset/CrowdHuman/images/train'): + filenames.append('../Dataset/CrowdHuman/images/train/' + filename) + + sampler = None + dataset = Dataset(filenames, args.input_size, params, True) + + if args.distributed: + sampler = data.distributed.DistributedSampler(dataset) + + loader = data.DataLoader(dataset, args.batch_size, sampler is None, sampler, + num_workers=4, pin_memory=True, collate_fn=Dataset.collate_fn) + + if args.distributed: + # DDP mode + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + model = torch.nn.parallel.DistributedDataParallel(module=model, + device_ids=[args.local_rank], + output_device=args.local_rank) + + # Start training + best = 0 + num_batch = len(loader) + amp_scale = torch.cuda.amp.GradScaler() + criterion = util.ComputeLoss(model, params) + num_warmup = max(round(params['warmup_epochs'] * num_batch), 1000) + with open('weights/step.csv', 'w') as f: + if args.local_rank == 0: + writer = csv.DictWriter(f, fieldnames=['epoch', + 'box', 'dfl', 'cls', + 'Recall', 'Precision', 'mAP@50', 'mAP']) + writer.writeheader() + for epoch in range(args.epochs): + model.train() + if args.distributed: + sampler.set_epoch(epoch) + if args.epochs - epoch == 10: + loader.dataset.mosaic = False + + p_bar = enumerate(loader) + + if args.local_rank == 0: + print(('\n' + '%10s' * 5) % ('epoch', 'memory', 'box', 'cls', 'dfl')) + if args.local_rank == 0: + p_bar = tqdm.tqdm(p_bar, total=num_batch) # progress bar + + optimizer.zero_grad() + avg_box_loss = util.AverageMeter() + avg_dfl_loss = util.AverageMeter() + avg_cls_loss = util.AverageMeter() + for i, (samples, targets) in p_bar: + x = i + num_batch * epoch # number of iterations + samples = samples.cuda().float() / 255 + + # Warmup + if x <= num_warmup: + xp = [0, num_warmup] + fp = [1, 64 / (args.batch_size * args.world_size)] + accumulate = max(1, numpy.interp(x, xp, fp).round()) + for j, y in enumerate(optimizer.param_groups): + if j == 0: + fp = [params['warmup_bias_lr'], y['initial_lr'] * lr(epoch)] + else: + fp = [0.0, y['initial_lr'] * lr(epoch)] + y['lr'] = numpy.interp(x, xp, fp) + if 'momentum' in y: + fp = [params['warmup_momentum'], params['momentum']] + y['momentum'] = numpy.interp(x, xp, fp) + + # Forward + with torch.cuda.amp.autocast(): + outputs = 
model(samples) # forward + loss_box, loss_cls, loss_dfl = criterion(outputs, targets) + + avg_box_loss.update(loss_box.item(), samples.size(0)) + avg_dfl_loss.update(loss_box.item(), samples.size(0)) + avg_cls_loss.update(loss_cls.item(), samples.size(0)) + + loss_box *= args.batch_size # loss scaled by batch_size + loss_dfl *= args.batch_size # loss scaled by batch_size + loss_cls *= args.batch_size # loss scaled by batch_size + loss_box *= args.world_size # gradient averaged between devices in DDP mode + loss_dfl *= args.world_size # gradient averaged between devices in DDP mode + loss_cls *= args.world_size # gradient averaged between devices in DDP mode + + # Backward + amp_scale.scale(loss_box + loss_cls + loss_dfl).backward() + + # Optimize + if x % accumulate == 0: + amp_scale.unscale_(optimizer) # unscale gradients + util.clip_gradients(model) # clip gradients + amp_scale.step(optimizer) # optimizer.step + amp_scale.update() + optimizer.zero_grad() + if ema: + ema.update(model) + + # Log + if args.local_rank == 0: + memory = f'{torch.cuda.memory_reserved() / 1E9:.3g}G' # (GB) + s = ('%10s' * 2 + '%10.3g' * 3) % (f'{epoch + 1}/{args.epochs}', memory, + avg_box_loss.avg, avg_cls_loss.avg, avg_dfl_loss.avg) + p_bar.set_description(s) + + # Scheduler + scheduler.step() + + if args.local_rank == 0: + # mAP + last = test(args, params, ema.ema) + writer.writerow({'epoch': str(epoch + 1).zfill(3), + 'box': str(f'{avg_box_loss.avg:.3f}'), + 'cls': str(f'{avg_cls_loss.avg:.3f}'), + 'dfl': str(f'{avg_dfl_loss.avg:.3f}'), + 'mAP': str(f'{last[0]:.3f}'), + 'mAP@50': str(f'{last[1]:.3f}'), + 'Recall': str(f'{last[2]:.3f}'), + 'Precision': str(f'{last[2]:.3f}')}) + f.flush() + + # Update best mAP + if last[0] > best: + best = last[0] + + # Save model + save = {'model': copy.deepcopy(ema.ema).half()} + + # Save last, best and delete + torch.save(save, './weights/last.pt') + if best == last[0]: + torch.save(save, './weights/best.pt') + del save + + if args.local_rank == 0: + util.strip_optimizer('./weights/best.pt') # strip optimizers + util.strip_optimizer('./weights/last.pt') # strip optimizers + + torch.cuda.empty_cache() + + +@torch.no_grad() +def test(args, params, model=None): + filenames = [] + for filename in os.listdir('../Dataset/CrowdHuman/images/val'): + filenames.append('../Dataset/CrowdHuman/images/val/' + filename) + numpy.random.shuffle(filenames) + dataset = Dataset(filenames, args.input_size, params, augment=False) + loader = data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=4, + pin_memory=True, collate_fn=Dataset.collate_fn) + + if model is None: + model = torch.load('./weights/best.pt', map_location='cuda')['model'].float() + + model.half() + model.eval() + + # Configure + iou_v = torch.linspace(0.5, 0.95, 10).cuda() # iou vector for mAP@0.5:0.95 + n_iou = iou_v.numel() + + m_pre = 0. + m_rec = 0. + map50 = 0. + mean_ap = 0. + metrics = [] + p_bar = tqdm.tqdm(loader, desc=('%10s' * 5) % ('', 'precision', 'recall', 'mAP50', 'mAP')) + for samples, targets in p_bar: + samples = samples.cuda() + samples = samples.half() # uint8 to fp16/32 + samples = samples / 255. 
# 0 - 255 to 0.0 - 1.0 + _, _, h, w = samples.shape # batch size, channels, height, width + scale = torch.tensor((w, h, w, h)).cuda() + # Inference + outputs = model(samples) + # NMS + outputs = util.non_max_suppression(outputs, 0.001, 0.7) + # Metrics + for i, output in enumerate(outputs): + idx = targets['idx'] == i + cls = targets['cls'][idx] + box = targets['box'][idx] + + cls = cls.cuda() + box = box.cuda() + + metric = torch.zeros(output.shape[0], n_iou, dtype=torch.bool).cuda() + + if output.shape[0] == 0: + if cls.shape[0]: + metrics.append((metric, *torch.zeros((2, 0)).cuda(), cls.squeeze(-1))) + continue + # Evaluate + if cls.shape[0]: + target = torch.cat((cls, util.wh2xy(box) * scale), 1) + metric = util.compute_metric(output[:, :6], target, iou_v) + # Append + metrics.append((metric, output[:, 4], output[:, 5], cls.squeeze(-1))) + + # Compute metrics + metrics = [torch.cat(x, 0).cpu().numpy() for x in zip(*metrics)] # to numpy + if len(metrics) and metrics[0].any(): + tp, fp, m_pre, m_rec, map50, mean_ap = util.compute_ap(*metrics) + # Print results + print(('%10s' + '%10.3g' * 4) % ("", m_pre, m_rec, map50, mean_ap)) + # Return results + model.float() # for training + return mean_ap, map50, m_rec, m_pre + + +@torch.no_grad() +def demo(args): + import cv2 + + # Load model + model = torch.load('./weights/best.pt', map_location='cuda')['model'].float() + model.half() + model.eval() + + camera = cv2.VideoCapture(0) + # Check if camera opened successfully + if not camera.isOpened(): + print("Error opening video stream or file") + # Read until video is completed + while camera.isOpened(): + # Capture frame-by-frame + success, frame = camera.read() + if success: + image = frame.copy() + shape = image.shape[:2] + + r = args.input_size / max(shape[0], shape[1]) + if r != 1: + resample = cv2.INTER_LINEAR if r > 1 else cv2.INTER_AREA + image = cv2.resize(image, dsize=(int(shape[1] * r), int(shape[0] * r)), interpolation=resample) + height, width = image.shape[:2] + + # Scale ratio (new / old) + r = min(1.0, args.input_size / height, args.input_size / width) + + # Compute padding + pad = int(round(width * r)), int(round(height * r)) + w = numpy.mod((args.input_size - pad[0]), 32) / 2 + h = numpy.mod((args.input_size - pad[1]), 32) / 2 + + if (width, height) != pad: # resize + image = cv2.resize(image, pad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(h - 0.1)), int(round(h + 0.1)) + left, right = int(round(w - 0.1)), int(round(w + 0.1)) + image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT) # add border + + # Convert HWC to CHW, BGR to RGB + x = image.transpose((2, 0, 1))[::-1] + x = numpy.ascontiguousarray(x) + x = torch.from_numpy(x) + x = x.unsqueeze(dim=0) + x = x.cuda() + x = x.half() + x = x / 255 + # Inference + outputs = model(x) + # NMS + outputs = util.non_max_suppression(outputs, 0.25, 0.7) + for output in outputs: + output[:, [0, 2]] -= w # x padding + output[:, [1, 3]] -= h # y padding + output[:, :4] /= min(height / shape[0], width / shape[1]) + + output[:, 0].clamp_(0, shape[1]) # x1 + output[:, 1].clamp_(0, shape[0]) # y1 + output[:, 2].clamp_(0, shape[1]) # x2 + output[:, 3].clamp_(0, shape[0]) # y2 + + for box in output: + box = box.cpu().numpy() + x1, y1, x2, y2, score, index = box + cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2) + + cv2.imshow('Frame', frame) + # Press Q on keyboard to exit + if cv2.waitKey(25) & 0xFF == ord('q'): + break + # Break the loop + else: + break + # When everything done, 
release the video capture object + camera.release() + + # Closes all the frames + cv2.destroyAllWindows() + + +def profile(args, params): + model = nn.yolo_v8_n(len(params['names'])) + shape = (1, 3, args.input_size, args.input_size) + + model.eval() + model(torch.zeros(shape)) + params = sum(p.numel() for p in model.parameters()) + if args.local_rank == 0: + print(f'Number of parameters: {int(params)}') + + +def main(): + parser = ArgumentParser() + parser.add_argument('--input-size', default=640, type=int) + parser.add_argument('--batch-size', default=32, type=int) + parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument('--epochs', default=300, type=int) + parser.add_argument('--train', action='store_true') + parser.add_argument('--test', action='store_true') + parser.add_argument('--demo', action='store_true') + + args = parser.parse_args() + + args.local_rank = int(os.getenv('LOCAL_RANK', 0)) + args.world_size = int(os.getenv('WORLD_SIZE', 1)) + args.distributed = int(os.getenv('WORLD_SIZE', 1)) > 1 + + if args.distributed: + torch.cuda.set_device(device=args.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + if args.local_rank == 0: + if not os.path.exists('weights'): + os.makedirs('weights') + + with open('utils/args.yaml', errors='ignore') as f: + params = yaml.safe_load(f) + + util.setup_seed() + util.setup_multi_processes() + + profile(args, params) + + if args.train: + train(args, params) + if args.test: + test(args, params) + if args.demo: + demo(args) + + +if __name__ == "__main__": + main() diff --git a/my_pretrained_method/YOLOv8_human/main.sh b/my_pretrained_method/YOLOv8_human/main.sh new file mode 100644 index 0000000000..12619097a8 --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/main.sh @@ -0,0 +1,2 @@ +GPUS=$1 +python3 -m torch.distributed.launch --nproc_per_node=$GPUS main.py ${@:2} \ No newline at end of file diff --git a/my_pretrained_method/YOLOv8_human/nets/nn.py b/my_pretrained_method/YOLOv8_human/nets/nn.py new file mode 100644 index 0000000000..03e147c38a --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/nets/nn.py @@ -0,0 +1,255 @@ +# import os +# os.chdir('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/YOLOv8_human') +import sys +sys.path.append('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/YOLOv8_human/utils') + +import math +import torch +from util import make_anchors + + +def fuse_conv(conv, norm): + fused_conv = torch.nn.Conv2d(conv.in_channels, + conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + groups=conv.groups, + bias=True).requires_grad_(False).to(conv.weight.device) + + w_conv = conv.weight.clone().view(conv.out_channels, -1) + w_norm = torch.diag(norm.weight.div(torch.sqrt(norm.eps + norm.running_var))) + fused_conv.weight.copy_(torch.mm(w_norm, w_conv).view(fused_conv.weight.size())) + + b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias + b_norm = norm.bias - norm.weight.mul(norm.running_mean).div(torch.sqrt(norm.running_var + norm.eps)) + fused_conv.bias.copy_(torch.mm(w_norm, b_conv.reshape(-1, 1)).reshape(-1) + b_norm) + + return fused_conv + + +class Conv(torch.nn.Module): + def __init__(self, in_ch, out_ch, k=1, s=1): + super().__init__() + self.conv = torch.nn.Conv2d(in_ch, out_ch, k, s, (k - 1) // 2, bias=False) + self.norm = torch.nn.BatchNorm2d(out_ch, eps=0.001, momentum=0.03) + self.relu = torch.nn.SiLU(inplace=True) + + def 
forward(self, x): + return self.relu(self.norm(self.conv(x))) + + def fuse_forward(self, x): + return self.relu(self.conv(x)) + + +class Residual(torch.nn.Module): + def __init__(self, ch, add=True): + super().__init__() + self.add_m = add + self.res_m = torch.nn.Sequential(Conv(ch, ch, 3), + Conv(ch, ch, 3)) + + def forward(self, x): + return self.res_m(x) + x if self.add_m else self.res_m(x) + + +class CSP(torch.nn.Module): + def __init__(self, in_ch, out_ch, n=1, add=True): + super().__init__() + self.conv1 = Conv(in_ch, out_ch // 2) + self.conv2 = Conv(in_ch, out_ch // 2) + self.conv3 = Conv((2 + n) * out_ch // 2, out_ch) + self.res_m = torch.nn.ModuleList(Residual(out_ch // 2, add) for _ in range(n)) + + def forward(self, x): + y = [self.conv1(x), self.conv2(x)] + y.extend(m(y[-1]) for m in self.res_m) + return self.conv3(torch.cat(y, dim=1)) + + +class SPP(torch.nn.Module): + def __init__(self, in_ch, out_ch, k=5): + super().__init__() + self.conv1 = Conv(in_ch, in_ch // 2) + self.conv2 = Conv(in_ch * 2, out_ch) + self.res_m = torch.nn.MaxPool2d(k, 1, k // 2) + + def forward(self, x): + x = self.conv1(x) + y1 = self.res_m(x) + y2 = self.res_m(y1) + return self.conv2(torch.cat([x, y1, y2, self.res_m(y2)], 1)) + + +class DarkNet(torch.nn.Module): + def __init__(self, width, depth): + super().__init__() + self.p1 = [] + self.p2 = [] + self.p3 = [] + self.p4 = [] + self.p5 = [] + self.p1.append(Conv(width[0], width[1], 3, 2)) + self.p2.append(Conv(width[1], width[2], 3, 2)) + self.p2.append(CSP(width[2], width[2], depth[0])) + self.p3.append(Conv(width[2], width[3], 3, 2)) + self.p3.append(CSP(width[3], width[3], depth[1])) + self.p4.append(Conv(width[3], width[4], 3, 2)) + self.p4.append(CSP(width[4], width[4], depth[2])) + self.p5.append(Conv(width[4], width[5], 3, 2)) + self.p5.append(CSP(width[5], width[5], depth[0])) + self.p5.append(SPP(width[5], width[5])) + + self.p1 = torch.nn.Sequential(*self.p1) + self.p2 = torch.nn.Sequential(*self.p2) + self.p3 = torch.nn.Sequential(*self.p3) + self.p4 = torch.nn.Sequential(*self.p4) + self.p5 = torch.nn.Sequential(*self.p5) + + def forward(self, x): + p1 = self.p1(x) + p2 = self.p2(p1) + p3 = self.p3(p2) + p4 = self.p4(p3) + p5 = self.p5(p4) + return p3, p4, p5 + + +class DarkFPN(torch.nn.Module): + def __init__(self, width, depth): + super().__init__() + self.up = torch.nn.Upsample(None, 2) + self.h1 = CSP(width[4] + width[5], width[4], depth[0], False) + self.h2 = CSP(width[3] + width[4], width[3], depth[0], False) + self.h3 = Conv(width[3], width[3], 3, 2) + self.h4 = CSP(width[3] + width[4], width[4], depth[0], False) + self.h5 = Conv(width[4], width[4], 3, 2) + self.h6 = CSP(width[4] + width[5], width[5], depth[0], False) + + def forward(self, x): + p3, p4, p5 = x + p4 = self.h1(torch.cat([self.up(p5), p4], 1)) + p3 = self.h2(torch.cat([self.up(p4), p3], 1)) + p4 = self.h4(torch.cat([self.h3(p3), p4], 1)) + p5 = self.h6(torch.cat([self.h5(p4), p5], 1)) + return p3, p4, p5 + + +class DFL(torch.nn.Module): + # Generalized Focal Loss + # https://ieeexplore.ieee.org/document/9792391 + def __init__(self, ch=16): + super().__init__() + self.ch = ch + self.conv = torch.nn.Conv2d(ch, 1, 1, bias=False).requires_grad_(False) + x = torch.arange(ch, dtype=torch.float).view(1, ch, 1, 1) + self.conv.weight.data[:] = torch.nn.Parameter(x) + + def forward(self, x): + b, c, a = x.shape + x = x.view(b, 4, self.ch, a).transpose(2, 1) + return self.conv(x.softmax(1)).view(b, 4, a) + + +class Head(torch.nn.Module): + anchors = torch.empty(0) + strides = 
torch.empty(0) + + def __init__(self, nc=80, filters=()): + super().__init__() + self.ch = 16 # DFL channels + self.nc = nc # number of classes + self.nl = len(filters) # number of detection layers + self.no = nc + self.ch * 4 # number of outputs per anchor + self.stride = torch.zeros(self.nl) # strides computed during build + + c1 = max(filters[0], self.nc) + c2 = max((filters[0] // 4, self.ch * 4)) + + self.dfl = DFL(self.ch) + self.cls = torch.nn.ModuleList(torch.nn.Sequential(Conv(x, c1, 3), + Conv(c1, c1, 3), + torch.nn.Conv2d(c1, self.nc, 1)) for x in filters) + self.box = torch.nn.ModuleList(torch.nn.Sequential(Conv(x, c2, 3), + Conv(c2, c2, 3), + torch.nn.Conv2d(c2, 4 * self.ch, 1)) for x in filters) + + def forward(self, x): + for i in range(self.nl): + x[i] = torch.cat((self.box[i](x[i]), self.cls[i](x[i])), 1) + if self.training: + return x + self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5)) + + x = torch.cat([i.view(x[0].shape[0], self.no, -1) for i in x], 2) + box, cls = x.split((self.ch * 4, self.nc), 1) + a, b = torch.split(self.dfl(box), 2, 1) + a = self.anchors.unsqueeze(0) - a + b = self.anchors.unsqueeze(0) + b + box = torch.cat(((a + b) / 2, b - a), 1) + return torch.cat((box * self.strides, cls.sigmoid()), 1) + + def initialize_biases(self): + # Initialize biases + # WARNING: requires stride availability + m = self + for a, b, s in zip(m.box, m.cls, m.stride): + a[-1].bias.data[:] = 1.0 # box + # cls (.01 objects, 80 classes, 640 img) + b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) + + +class YOLO(torch.nn.Module): + def __init__(self, width, depth, num_classes): + super().__init__() + self.net = DarkNet(width, depth) + self.fpn = DarkFPN(width, depth) + + img_dummy = torch.zeros(1, 3, 256, 256) + self.head = Head(num_classes, (width[3], width[4], width[5])) + self.head.stride = torch.tensor([256 / x.shape[-2] for x in self.forward(img_dummy)]) + self.stride = self.head.stride + self.head.initialize_biases() + + def forward(self, x): + x = self.net(x) + x = self.fpn(x) + return self.head(list(x)) + + def fuse(self): + for m in self.modules(): + if type(m) is Conv and hasattr(m, 'norm'): + m.conv = fuse_conv(m.conv, m.norm) + m.forward = m.fuse_forward + delattr(m, 'norm') + return self + + +def yolo_v8_n(num_classes: int = 80): + depth = [1, 2, 2] + width = [3, 16, 32, 64, 128, 256] + return YOLO(width, depth, num_classes) + + +def yolo_v8_s(num_classes: int = 80): + depth = [1, 2, 2] + width = [3, 32, 64, 128, 256, 512] + return YOLO(width, depth, num_classes) + + +def yolo_v8_m(num_classes: int = 80): + depth = [2, 4, 4] + width = [3, 48, 96, 192, 384, 576] + return YOLO(width, depth, num_classes) + + +def yolo_v8_l(num_classes: int = 80): + depth = [3, 6, 6] + width = [3, 64, 128, 256, 512, 512] + return YOLO(width, depth, num_classes) + + +def yolo_v8_x(num_classes: int = 80): + depth = [3, 6, 6] + width = [3, 80, 160, 320, 640, 640] + return YOLO(width, depth, num_classes) diff --git a/my_pretrained_method/YOLOv8_human/temp_video.mp4 b/my_pretrained_method/YOLOv8_human/temp_video.mp4 new file mode 100644 index 0000000000..461b14f46d Binary files /dev/null and b/my_pretrained_method/YOLOv8_human/temp_video.mp4 differ diff --git a/my_pretrained_method/YOLOv8_human/utils/args.yaml b/my_pretrained_method/YOLOv8_human/utils/args.yaml new file mode 100644 index 0000000000..3ebabd5c26 --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/utils/args.yaml @@ -0,0 +1,23 @@ +lr0: 0.010 # initial learning rate 
(SGD=1E-2, Adam=1E-3) +lrf: 0.010 # final OneCycleLR learning rate (lr0 * lrf) +momentum: 0.93700000 # SGD momentum/Adam beta1 +weight_decay: 0.0005 # optimizer weight decay 5e-4 +warmup_epochs: 3.000 # warmup epochs +warmup_momentum: 0.8 # warmup initial momentum +warmup_bias_lr: 0.10 # warmup initial bias lr +box: 7.5 # box loss gain +cls: 0.5 # cls loss gain +dfl: 1.5 # cls loss gain +hsv_h: 0.015000 # image HSV-Hue augmentation (fraction) +hsv_s: 0.700000 # image HSV-Saturation augmentation (fraction) +hsv_v: 0.400000 # image HSV-Value augmentation (fraction) +degrees: 0.0000 # image rotation (+/- deg) +translate: 0.10 # image translation (+/- fraction) +scale: 0.500000 # image scale (+/- gain) +shear: 0.000000 # image shear (+/- deg) +flip_ud: 0.0000 # image flip up-down (probability) +flip_lr: 0.5000 # image flip left-right (probability) +mosaic: 1.00000 # image mosaic (probability) +mix_up: 0.00000 # image mix-up (probability) +names: + 0: person \ No newline at end of file diff --git a/my_pretrained_method/YOLOv8_human/utils/dataset.py b/my_pretrained_method/YOLOv8_human/utils/dataset.py new file mode 100644 index 0000000000..d76b0decc8 --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/utils/dataset.py @@ -0,0 +1,418 @@ +import math +import os +import random + +import cv2 +import numpy +import torch +from PIL import Image +from torch.utils import data + +FORMATS = 'bmp', 'dng', 'jpeg', 'jpg', 'mpo', 'png', 'tif', 'tiff', 'webp' + + +class Dataset(data.Dataset): + def __init__(self, filenames, input_size, params, augment): + self.params = params + self.mosaic = augment + self.augment = augment + self.input_size = input_size + + # Read labels + labels = self.load_label(filenames) + self.labels = list(labels.values()) + self.filenames = list(labels.keys()) # update + self.n = len(self.filenames) # number of samples + self.indices = range(self.n) + # Albumentations (optional, only used if package is installed) + self.albumentations = Albumentations() + + def __getitem__(self, index): + index = self.indices[index] + + params = self.params + mosaic = self.mosaic and random.random() < params['mosaic'] + + if mosaic: + # Load MOSAIC + image, label = self.load_mosaic(index, params) + # MixUp augmentation + if random.random() < params['mix_up']: + index = random.choice(self.indices) + mix_image1, mix_label1 = image, label + mix_image2, mix_label2 = self.load_mosaic(index, params) + + image, label = mix_up(mix_image1, mix_label1, mix_image2, mix_label2) + else: + # Load image + image, shape = self.load_image(index) + h, w = image.shape[:2] + + # Resize + image, ratio, pad = resize(image, self.input_size, self.augment) + + label = self.labels[index].copy() + if label.size: + label[:, 1:] = wh2xy(label[:, 1:], ratio[0] * w, ratio[1] * h, pad[0], pad[1]) + if self.augment: + image, label = random_perspective(image, label, params) + + nl = len(label) # number of labels + h, w = image.shape[:2] + cls = label[:, 0:1] + box = label[:, 1:5] + box = xy2wh(box, w, h) + + if self.augment: + # Albumentations + image, box, cls = self.albumentations(image, box, cls) + nl = len(box) # update after albumentations + # HSV color-space + augment_hsv(image, params) + # Flip up-down + if random.random() < params['flip_ud']: + image = numpy.flipud(image) + if nl: + box[:, 1] = 1 - box[:, 1] + # Flip left-right + if random.random() < params['flip_lr']: + image = numpy.fliplr(image) + if nl: + box[:, 0] = 1 - box[:, 0] + + target_cls = torch.zeros((nl, 1)) + target_box = torch.zeros((nl, 4)) + if nl: + 
target_cls = torch.from_numpy(cls) + target_box = torch.from_numpy(box) + + # Convert HWC to CHW, BGR to RGB + sample = image.transpose((2, 0, 1))[::-1] + sample = numpy.ascontiguousarray(sample) + + return torch.from_numpy(sample), target_cls, target_box, torch.zeros(nl) + + def __len__(self): + return len(self.filenames) + + def load_image(self, i): + image = cv2.imread(self.filenames[i]) + h, w = image.shape[:2] + r = self.input_size / max(h, w) + if r != 1: + image = cv2.resize(image, + dsize=(int(w * r), int(h * r)), + interpolation=resample() if self.augment else cv2.INTER_LINEAR) + return image, (h, w) + + def load_mosaic(self, index, params): + label4 = [] + border = [-self.input_size // 2, -self.input_size // 2] + image4 = numpy.full((self.input_size * 2, self.input_size * 2, 3), 0, dtype=numpy.uint8) + y1a, y2a, x1a, x2a, y1b, y2b, x1b, x2b = (None, None, None, None, None, None, None, None) + + xc = int(random.uniform(-border[0], 2 * self.input_size + border[1])) + yc = int(random.uniform(-border[0], 2 * self.input_size + border[1])) + + indices = [index] + random.choices(self.indices, k=3) + random.shuffle(indices) + + for i, index in enumerate(indices): + # Load image + image, _ = self.load_image(index) + shape = image.shape + if i == 0: # top left + x1a = max(xc - shape[1], 0) + y1a = max(yc - shape[0], 0) + x2a = xc + y2a = yc + x1b = shape[1] - (x2a - x1a) + y1b = shape[0] - (y2a - y1a) + x2b = shape[1] + y2b = shape[0] + if i == 1: # top right + x1a = xc + y1a = max(yc - shape[0], 0) + x2a = min(xc + shape[1], self.input_size * 2) + y2a = yc + x1b = 0 + y1b = shape[0] - (y2a - y1a) + x2b = min(shape[1], x2a - x1a) + y2b = shape[0] + if i == 2: # bottom left + x1a = max(xc - shape[1], 0) + y1a = yc + x2a = xc + y2a = min(self.input_size * 2, yc + shape[0]) + x1b = shape[1] - (x2a - x1a) + y1b = 0 + x2b = shape[1] + y2b = min(y2a - y1a, shape[0]) + if i == 3: # bottom right + x1a = xc + y1a = yc + x2a = min(xc + shape[1], self.input_size * 2) + y2a = min(self.input_size * 2, yc + shape[0]) + x1b = 0 + y1b = 0 + x2b = min(shape[1], x2a - x1a) + y2b = min(y2a - y1a, shape[0]) + + pad_w = x1a - x1b + pad_h = y1a - y1b + image4[y1a:y2a, x1a:x2a] = image[y1b:y2b, x1b:x2b] + + # Labels + label = self.labels[index].copy() + if len(label): + label[:, 1:] = wh2xy(label[:, 1:], shape[1], shape[0], pad_w, pad_h) + label4.append(label) + + # Concat/clip labels + label4 = numpy.concatenate(label4, 0) + for x in label4[:, 1:]: + numpy.clip(x, 0, 2 * self.input_size, out=x) + + # Augment + image4, label4 = random_perspective(image4, label4, params, border) + + return image4, label4 + + @staticmethod + def collate_fn(batch): + samples, cls, box, indices = zip(*batch) + + cls = torch.cat(cls, 0) + box = torch.cat(box, 0) + + new_indices = list(indices) + for i in range(len(indices)): + new_indices[i] += i + indices = torch.cat(new_indices, 0) + + targets = {'cls': cls, + 'box': box, + 'idx': indices} + return torch.stack(samples, 0), targets + + @staticmethod + def load_label(filenames): + path = f'{os.path.dirname(filenames[0])}.cache' + if os.path.exists(path): + return torch.load(path) + x = {} + for filename in filenames: + try: + # verify images + with open(filename, 'rb') as f: + image = Image.open(f) + image.verify() # PIL verify + shape = image.size # image size + assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels' + assert image.format.lower() in FORMATS, f'invalid image format {image.format}' + + # verify labels + a = f'{os.sep}images{os.sep}' + b = 
f'{os.sep}labels{os.sep}' + if os.path.isfile(b.join(filename.rsplit(a, 1)).rsplit('.', 1)[0] + '.txt'): + with open(b.join(filename.rsplit(a, 1)).rsplit('.', 1)[0] + '.txt') as f: + label = [x.split() for x in f.read().strip().splitlines() if len(x)] + label = numpy.array(label, dtype=numpy.float32) + nl = len(label) + if nl: + assert (label >= 0).all() + assert label.shape[1] == 5 + assert (label[:, 1:] <= 1).all() + _, i = numpy.unique(label, axis=0, return_index=True) + if len(i) < nl: # duplicate row check + label = label[i] # remove duplicates + else: + label = numpy.zeros((0, 5), dtype=numpy.float32) + else: + label = numpy.zeros((0, 5), dtype=numpy.float32) + if filename: + x[filename] = label + except FileNotFoundError: + pass + except AssertionError: + pass + torch.save(x, path) + return x + + +def wh2xy(x, w=640, h=640, pad_w=0, pad_h=0): + # Convert nx4 boxes + # from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + y = numpy.copy(x) + y[:, 0] = w * (x[:, 0] - x[:, 2] / 2) + pad_w # top left x + y[:, 1] = h * (x[:, 1] - x[:, 3] / 2) + pad_h # top left y + y[:, 2] = w * (x[:, 0] + x[:, 2] / 2) + pad_w # bottom right x + y[:, 3] = h * (x[:, 1] + x[:, 3] / 2) + pad_h # bottom right y + return y + + +def xy2wh(x, w, h): + # warning: inplace clip + x[:, [0, 2]] = x[:, [0, 2]].clip(0, w - 1E-3) # x1, x2 + x[:, [1, 3]] = x[:, [1, 3]].clip(0, h - 1E-3) # y1, y2 + + # Convert nx4 boxes + # from [x1, y1, x2, y2] to [x, y, w, h] normalized where xy1=top-left, xy2=bottom-right + y = numpy.copy(x) + y[:, 0] = ((x[:, 0] + x[:, 2]) / 2) / w # x center + y[:, 1] = ((x[:, 1] + x[:, 3]) / 2) / h # y center + y[:, 2] = (x[:, 2] - x[:, 0]) / w # width + y[:, 3] = (x[:, 3] - x[:, 1]) / h # height + return y + + +def resample(): + choices = (cv2.INTER_AREA, + cv2.INTER_CUBIC, + cv2.INTER_LINEAR, + cv2.INTER_NEAREST, + cv2.INTER_LANCZOS4) + return random.choice(seq=choices) + + +def augment_hsv(image, params): + # HSV color-space augmentation + h = params['hsv_h'] + s = params['hsv_s'] + v = params['hsv_v'] + + r = numpy.random.uniform(-1, 1, 3) * [h, s, v] + 1 + h, s, v = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2HSV)) + + x = numpy.arange(0, 256, dtype=r.dtype) + lut_h = ((x * r[0]) % 180).astype('uint8') + lut_s = numpy.clip(x * r[1], 0, 255).astype('uint8') + lut_v = numpy.clip(x * r[2], 0, 255).astype('uint8') + + hsv = cv2.merge((cv2.LUT(h, lut_h), cv2.LUT(s, lut_s), cv2.LUT(v, lut_v))) + cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR, dst=image) # no return needed + + +def resize(image, input_size, augment): + # Resize and pad image while meeting stride-multiple constraints + shape = image.shape[:2] # current shape [height, width] + + # Scale ratio (new / old) + r = min(input_size / shape[0], input_size / shape[1]) + if not augment: # only scale down, do not scale up (for better val mAP) + r = min(r, 1.0) + + # Compute padding + pad = int(round(shape[1] * r)), int(round(shape[0] * r)) + w = (input_size - pad[0]) / 2 + h = (input_size - pad[1]) / 2 + + if shape[::-1] != pad: # resize + image = cv2.resize(image, + dsize=pad, + interpolation=resample() if augment else cv2.INTER_LINEAR) + top, bottom = int(round(h - 0.1)), int(round(h + 0.1)) + left, right = int(round(w - 0.1)), int(round(w + 0.1)) + image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT) # add border + return image, (r, r), (w, h) + + +def candidates(box1, box2): + # box1(4,n), box2(4,n) + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - 
box2[1] + aspect_ratio = numpy.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16)) # aspect ratio + return (w2 > 2) & (h2 > 2) & (w2 * h2 / (w1 * h1 + 1e-16) > 0.1) & (aspect_ratio < 100) + + +def random_perspective(image, label, params, border=(0, 0)): + h = image.shape[0] + border[0] * 2 + w = image.shape[1] + border[1] * 2 + + # Center + center = numpy.eye(3) + center[0, 2] = -image.shape[1] / 2 # x translation (pixels) + center[1, 2] = -image.shape[0] / 2 # y translation (pixels) + + # Perspective + perspective = numpy.eye(3) + + # Rotation and Scale + rotate = numpy.eye(3) + a = random.uniform(-params['degrees'], params['degrees']) + s = random.uniform(1 - params['scale'], 1 + params['scale']) + rotate[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + shear = numpy.eye(3) + shear[0, 1] = math.tan(random.uniform(-params['shear'], params['shear']) * math.pi / 180) + shear[1, 0] = math.tan(random.uniform(-params['shear'], params['shear']) * math.pi / 180) + + # Translation + translate = numpy.eye(3) + translate[0, 2] = random.uniform(0.5 - params['translate'], 0.5 + params['translate']) * w + translate[1, 2] = random.uniform(0.5 - params['translate'], 0.5 + params['translate']) * h + + # Combined rotation matrix, order of operations (right to left) is IMPORTANT + matrix = translate @ shear @ rotate @ perspective @ center + if (border[0] != 0) or (border[1] != 0) or (matrix != numpy.eye(3)).any(): # image changed + image = cv2.warpAffine(image, matrix[:2], dsize=(w, h), borderValue=(0, 0, 0)) + + # Transform label coordinates + n = len(label) + if n: + xy = numpy.ones((n * 4, 3)) + xy[:, :2] = label[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ matrix.T # transform + xy = xy[:, :2].reshape(n, 8) # perspective rescale or affine + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + box = numpy.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # clip + box[:, [0, 2]] = box[:, [0, 2]].clip(0, w) + box[:, [1, 3]] = box[:, [1, 3]].clip(0, h) + # filter candidates + indices = candidates(box1=label[:, 1:5].T * s, box2=box.T) + + label = label[indices] + label[:, 1:5] = box[indices] + + return image, label + + +def mix_up(image1, box1, image2, box2): + # Applies MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf + alpha = numpy.random.beta(32.0, 32.0) # mix-up ratio, alpha=beta=32.0 + image = (image1 * alpha + image2 * (1 - alpha)).astype(numpy.uint8) + box = numpy.concatenate((box1, box2), 0) + return image, box + + +class Albumentations: + def __init__(self): + self.transform = None + try: + import albumentations + + transforms = [albumentations.Blur(p=0.01), + albumentations.CLAHE(p=0.01), + albumentations.ToGray(p=0.01), + albumentations.MedianBlur(p=0.01)] + self.transform = albumentations.Compose(transforms, + albumentations.BboxParams('yolo', ['class_labels'])) + + except ImportError: # package not installed, skip + pass + + def __call__(self, image, box, cls): + if self.transform: + x = self.transform(image=image, + bboxes=box, + class_labels=cls) + image = x['image'] + box = numpy.array(x['bboxes']) + cls = numpy.array(x['class_labels']) + return image, box, cls diff --git a/my_pretrained_method/YOLOv8_human/utils/util.py b/my_pretrained_method/YOLOv8_human/utils/util.py new file mode 100644 index 0000000000..5118f5fe79 --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/utils/util.py @@ -0,0 +1,577 @@ +import copy +import math +import random +from time import time +import os 
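+# NOTE: the os.chdir()/sys.path.append() calls below hard-code an absolute path to
+# the original author's data_juicer checkout; when this module is run from another
+# machine they will likely need to point at the local YOLOv8_human directory instead.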
+os.chdir('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/YOLOv8_human') +import sys +sys.path.append('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/YOLOv8_human') + +import numpy +import torch +import torchvision +from torch.nn.functional import cross_entropy + + +def setup_seed(): + """ + Setup random seed. + """ + random.seed(0) + numpy.random.seed(0) + torch.manual_seed(0) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + +def setup_multi_processes(): + """ + Setup multi-processing environment variables. + """ + import cv2 + from os import environ + from platform import system + + # set multiprocess start method as `fork` to speed up the training + if system() != 'Windows': + torch.multiprocessing.set_start_method('fork', force=True) + + # disable opencv multithreading to avoid system being overloaded + cv2.setNumThreads(0) + + # setup OMP threads + if 'OMP_NUM_THREADS' not in environ: + environ['OMP_NUM_THREADS'] = '1' + + # setup MKL threads + if 'MKL_NUM_THREADS' not in environ: + environ['MKL_NUM_THREADS'] = '1' + + +def export_onnx(args): + import onnx # noqa + + inputs = ['images'] + outputs = ['outputs'] + dynamic = {'outputs': {0: 'batch', 1: 'anchors'}} + + m = torch.load('./weights/best.pt')['model'].float() + x = torch.zeros((1, 3, args.input_size, args.input_size)) + + torch.onnx.export(m.cpu(), x.cpu(), + './weights/best.onnx', + verbose=False, + opset_version=12, + # WARNING: DNN inference with torch>=1.12 may require do_constant_folding=False + do_constant_folding=True, + input_names=inputs, + output_names=outputs, + dynamic_axes=dynamic or None) + + # Checks + model_onnx = onnx.load('./weights/best.onnx') # load onnx model + onnx.checker.check_model(model_onnx) # check onnx model + + onnx.save(model_onnx, './weights/best.onnx') + # Inference example + # https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/autobackend.py + + +def wh2xy(x): + y = x.clone() if isinstance(x, torch.Tensor) else numpy.copy(x) + y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x + y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y + y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x + y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y + return y + + +def make_anchors(x, strides, offset=0.5): + anchors, stride_tensor = [], [] + for i, stride in enumerate(strides): + _, _, h, w = x[i].shape + sx = torch.arange(end=w, dtype=x[i].dtype, device=x[i].device) + offset # shift x + sy = torch.arange(end=h, dtype=x[i].dtype, device=x[i].device) + offset # shift y + sy, sx = torch.meshgrid(sy, sx) + anchors.append(torch.stack((sx, sy), -1).view(-1, 2)) + stride_tensor.append(torch.full((h * w, 1), stride, dtype=x[i].dtype, device=x[i].device)) + return torch.cat(anchors), torch.cat(stride_tensor) + + +def compute_metric(output, target, iou_v): + # intersection(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2) + (a1, a2) = target[:, 1:].unsqueeze(1).chunk(2, 2) + (b1, b2) = output[:, :4].unsqueeze(0).chunk(2, 2) + intersection = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2) + # IoU = intersection / (area1 + area2 - intersection) + iou = intersection / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - intersection + 1e-7) + + correct = numpy.zeros((output.shape[0], iou_v.shape[0])) + correct = correct.astype(bool) + for i in range(len(iou_v)): + # IoU > threshold and classes match + x = torch.where((iou >= iou_v[i]) & (target[:, 0:1] == output[:, 5])) + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), + 
iou[x[0], x[1]][:, None]), 1).cpu().numpy() # [label, detect, iou] + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[numpy.unique(matches[:, 1], return_index=True)[1]] + matches = matches[numpy.unique(matches[:, 0], return_index=True)[1]] + correct[matches[:, 1].astype(int), i] = True + return torch.tensor(correct, dtype=torch.bool, device=output.device) + + +def non_max_suppression(outputs, conf_threshold, iou_threshold): + max_wh = 7680 + max_det = 300 + max_nms = 30000 + + bs = outputs.shape[0] # batch size + nc = outputs.shape[1] - 4 # number of classes + xc = outputs[:, 4:4 + nc].amax(1) > conf_threshold # candidates + + start = time() + limit = 0.5 + 0.05 * bs # seconds to quit after + + output = [torch.zeros((0, 6), device=outputs.device)] * bs + for index, x in enumerate(outputs): # image index, image inference + x = x.transpose(0, -1)[xc[index]] # confidence + + # If none remain process next image + if not x.shape[0]: + continue + + box, cls = x.split((4, nc), 1) + box = wh2xy(box) # (cx, cy, w, h) to (x1, y1, x2, y2) + if nc > 1: + i, j = (cls > conf_threshold).nonzero(as_tuple=False).T + x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float()), 1) + else: # best class only + conf, j = cls.max(1, keepdim=True) + x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_threshold] + + if not x.shape[0]: # no boxes + continue + x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence and remove excess boxes + + # Batched NMS + c = x[:, 5:6] * max_wh # classes + boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores + i = torchvision.ops.nms(boxes, scores, iou_threshold) # NMS + i = i[:max_det] # limit detections + + output[index] = x[i] + if (time() - start) > limit: + break # time limit exceeded + + return output + + +def smooth(y, f=0.05): + # Box filter of fraction f + nf = round(len(y) * f * 2) // 2 + 1 # number of filter elements (must be odd) + p = numpy.ones(nf // 2) # ones padding + yp = numpy.concatenate((p * y[0], y, p * y[-1]), 0) # y padded + return numpy.convolve(yp, numpy.ones(nf) / nf, mode='valid') # y-smoothed + + +def compute_ap(tp, conf, pred_cls, target_cls, eps=1e-16): + """ + Compute the average precision, given the recall and precision curves. + Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (nparray, nx1 or nx10). + conf: Object-ness value from 0-1 (nparray). + pred_cls: Predicted object classes (nparray). + target_cls: True object classes (nparray). 
+ # Returns + The average precision + """ + # Sort by object-ness + i = numpy.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes, nt = numpy.unique(target_cls, return_counts=True) + nc = unique_classes.shape[0] # number of classes, number of detections + + # Create Precision-Recall curve and compute AP for each class + p = numpy.zeros((nc, 1000)) + r = numpy.zeros((nc, 1000)) + ap = numpy.zeros((nc, tp.shape[1])) + px, py = numpy.linspace(0, 1, 1000), [] # for plotting + for ci, c in enumerate(unique_classes): + i = pred_cls == c + nl = nt[ci] # number of labels + no = i.sum() # number of outputs + if no == 0 or nl == 0: + continue + + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum(0) + tpc = tp[i].cumsum(0) + + # Recall + recall = tpc / (nl + eps) # recall curve + # negative x, xp because xp decreases + r[ci] = numpy.interp(-px, -conf[i], recall[:, 0], left=0) + + # Precision + precision = tpc / (tpc + fpc) # precision curve + p[ci] = numpy.interp(-px, -conf[i], precision[:, 0], left=1) # p at pr_score + + # AP from recall-precision curve + for j in range(tp.shape[1]): + m_rec = numpy.concatenate(([0.0], recall[:, j], [1.0])) + m_pre = numpy.concatenate(([1.0], precision[:, j], [0.0])) + + # Compute the precision envelope + m_pre = numpy.flip(numpy.maximum.accumulate(numpy.flip(m_pre))) + + # Integrate area under curve + x = numpy.linspace(0, 1, 101) # 101-point interp (COCO) + ap[ci, j] = numpy.trapz(numpy.interp(x, m_rec, m_pre), x) # integrate + + # Compute F1 (harmonic mean of precision and recall) + f1 = 2 * p * r / (p + r + eps) + + i = smooth(f1.mean(0), 0.1).argmax() # max F1 index + p, r, f1 = p[:, i], r[:, i], f1[:, i] + tp = (r * nt).round() # true positives + fp = (tp / (p + eps) - tp).round() # false positives + ap50, ap = ap[:, 0], ap.mean(1) # AP@0.5, AP@0.5:0.95 + m_pre, m_rec = p.mean(), r.mean() + map50, mean_ap = ap50.mean(), ap.mean() + return tp, fp, m_pre, m_rec, map50, mean_ap + + +def compute_iou(box1, box2, eps=1e-7): + # Returns Intersection over Union (IoU) of box1(1,4) to box2(n,4) + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1) + b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1) + w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps + w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps + + # Intersection area + inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp(0) * \ + (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp(0) + + # Union Area + union = w1 * h1 + w2 * h2 - inter + eps + + # IoU + iou = inter / union + cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1) # convex (smallest enclosing box) width + ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1) # convex height + c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared + rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4 # center dist ** 2 + # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47 + v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2) + with torch.no_grad(): + alpha = v / (v - iou + (1 + eps)) + return iou - (rho2 / c2 + v * alpha) # CIoU + + +def strip_optimizer(filename): + x = torch.load(filename, map_location=torch.device('cpu')) + x['model'].half() # to FP16 + for p in x['model'].parameters(): + p.requires_grad = False + torch.save(x, filename) + + +def clip_gradients(model, max_norm=10.0): + parameters = model.parameters() + torch.nn.utils.clip_grad_norm_(parameters, 
max_norm=max_norm) + + +def load_weight(ckpt, model): + dst = model.state_dict() + src = torch.load(ckpt, 'cpu')['model'].float().state_dict() + ckpt = {} + for k, v in src.items(): + if k in dst and v.shape == dst[k].shape: + ckpt[k] = v + model.load_state_dict(state_dict=ckpt, strict=False) + return model + + +class EMA: + """ + Updated Exponential Moving Average (EMA) from https://github.com/rwightman/pytorch-image-models + Keeps a moving average of everything in the model state_dict (parameters and buffers) + For EMA details see https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + """ + + def __init__(self, model, decay=0.9999, tau=2000, updates=0): + # Create EMA + self.ema = copy.deepcopy(model).eval() # FP32 EMA + self.updates = updates # number of EMA updates + # decay exponential ramp (to help early epochs) + self.decay = lambda x: decay * (1 - math.exp(-x / tau)) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def update(self, model): + if hasattr(model, 'module'): + model = model.module + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay(self.updates) + + msd = model.state_dict() # model state_dict + for k, v in self.ema.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1 - d) * msd[k].detach() + + +class AverageMeter: + def __init__(self): + self.num = 0 + self.sum = 0 + self.avg = 0 + + def update(self, v, n): + if not math.isnan(float(v)): + self.num = self.num + n + self.sum = self.sum + v * n + self.avg = self.sum / self.num + + +class Assigner(torch.nn.Module): + def __init__(self, top_k=13, nc=80, alpha=1.0, beta=6.0, eps=1E-9): + super().__init__() + self.top_k = top_k + self.nc = nc + self.alpha = alpha + self.beta = beta + self.eps = eps + + @torch.no_grad() + def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt): + batch_size = pd_scores.size(0) + num_max_boxes = gt_bboxes.size(1) + + if num_max_boxes == 0: + device = gt_bboxes.device + return (torch.full_like(pd_scores[..., 0], self.nc).to(device), + torch.zeros_like(pd_bboxes).to(device), + torch.zeros_like(pd_scores).to(device), + torch.zeros_like(pd_scores[..., 0]).to(device), + torch.zeros_like(pd_scores[..., 0]).to(device)) + + num_anchors = anc_points.shape[0] + shape = gt_bboxes.shape + lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2) + mask_in_gts = torch.cat((anc_points[None] - lt, rb - anc_points[None]), dim=2) + mask_in_gts = mask_in_gts.view(shape[0], shape[1], num_anchors, -1).amin(3).gt_(self.eps) + na = pd_bboxes.shape[-2] + gt_mask = (mask_in_gts * mask_gt).bool() # b, max_num_obj, h*w + overlaps = torch.zeros([batch_size, num_max_boxes, na], dtype=pd_bboxes.dtype, device=pd_bboxes.device) + bbox_scores = torch.zeros([batch_size, num_max_boxes, na], dtype=pd_scores.dtype, device=pd_scores.device) + + ind = torch.zeros([2, batch_size, num_max_boxes], dtype=torch.long) # 2, b, max_num_obj + ind[0] = torch.arange(end=batch_size).view(-1, 1).expand(-1, num_max_boxes) # b, max_num_obj + ind[1] = gt_labels.squeeze(-1) # b, max_num_obj + bbox_scores[gt_mask] = pd_scores[ind[0], :, ind[1]][gt_mask] # b, max_num_obj, h*w + + pd_boxes = pd_bboxes.unsqueeze(1).expand(-1, num_max_boxes, -1, -1)[gt_mask] + gt_boxes = gt_bboxes.unsqueeze(2).expand(-1, -1, na, -1)[gt_mask] + overlaps[gt_mask] = compute_iou(gt_boxes, pd_boxes).squeeze(-1).clamp_(0) + + align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta) + + top_k_mask = mask_gt.expand(-1, -1, self.top_k).bool() 
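+        # Keep the top-k candidate anchors for each ground-truth box, ranked by the
+        # alignment metric (class score ** alpha * IoU ** beta); anchors that end up
+        # claimed by more than one ground truth are resolved below by highest IoU.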
+ top_k_metrics, top_k_indices = torch.topk(align_metric, self.top_k, dim=-1, largest=True) + if top_k_mask is None: + top_k_mask = (top_k_metrics.max(-1, keepdim=True)[0] > self.eps).expand_as(top_k_indices) + top_k_indices.masked_fill_(~top_k_mask, 0) + + mask_top_k = torch.zeros(align_metric.shape, dtype=torch.int8, device=top_k_indices.device) + ones = torch.ones_like(top_k_indices[:, :, :1], dtype=torch.int8, device=top_k_indices.device) + for k in range(self.top_k): + mask_top_k.scatter_add_(-1, top_k_indices[:, :, k:k + 1], ones) + mask_top_k.masked_fill_(mask_top_k > 1, 0) + mask_top_k = mask_top_k.to(align_metric.dtype) + mask_pos = mask_top_k * mask_in_gts * mask_gt + + fg_mask = mask_pos.sum(-2) + if fg_mask.max() > 1: + mask_multi_gts = (fg_mask.unsqueeze(1) > 1).expand(-1, num_max_boxes, -1) + max_overlaps_idx = overlaps.argmax(1) + + is_max_overlaps = torch.zeros(mask_pos.shape, dtype=mask_pos.dtype, device=mask_pos.device) + is_max_overlaps.scatter_(1, max_overlaps_idx.unsqueeze(1), 1) + + mask_pos = torch.where(mask_multi_gts, is_max_overlaps, mask_pos).float() + fg_mask = mask_pos.sum(-2) + target_gt_idx = mask_pos.argmax(-2) + + # Assigned target + index = torch.arange(end=batch_size, dtype=torch.int64, device=gt_labels.device)[..., None] + target_index = target_gt_idx + index * num_max_boxes + target_labels = gt_labels.long().flatten()[target_index] + + target_bboxes = gt_bboxes.view(-1, gt_bboxes.shape[-1])[target_index] + + # Assigned target scores + target_labels.clamp_(0) + + target_scores = torch.zeros((target_labels.shape[0], target_labels.shape[1], self.nc), + dtype=torch.int64, + device=target_labels.device) + target_scores.scatter_(2, target_labels.unsqueeze(-1), 1) + + fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.nc) + target_scores = torch.where(fg_scores_mask > 0, target_scores, 0) + + # Normalize + align_metric *= mask_pos + pos_align_metrics = align_metric.amax(dim=-1, keepdim=True) + pos_overlaps = (overlaps * mask_pos).amax(dim=-1, keepdim=True) + norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1) + target_scores = target_scores * norm_align_metric + + return target_bboxes, target_scores, fg_mask.bool() + + +class BoxLoss(torch.nn.Module): + def __init__(self, dfl_ch): + super().__init__() + self.dfl_ch = dfl_ch + + def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask): + # IoU loss + weight = torch.masked_select(target_scores.sum(-1), fg_mask).unsqueeze(-1) + iou = compute_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask]) + loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum + + # DFL loss + a, b = target_bboxes.chunk(2, -1) + target = torch.cat((anchor_points - a, b - anchor_points), -1) + target = target.clamp(0, self.dfl_ch - 0.01) + loss_dfl = self.df_loss(pred_dist[fg_mask].view(-1, self.dfl_ch + 1), target[fg_mask]) + loss_dfl = (loss_dfl * weight).sum() / target_scores_sum + + return loss_iou, loss_dfl + + @staticmethod + def df_loss(pred_dist, target): + # Distribution Focal Loss (DFL) + # https://ieeexplore.ieee.org/document/9792391 + tl = target.long() # target left + tr = tl + 1 # target right + wl = tr - target # weight left + wr = 1 - wl # weight right + left_loss = cross_entropy(pred_dist, tl.view(-1), reduction='none').view(tl.shape) + right_loss = cross_entropy(pred_dist, tr.view(-1), reduction='none').view(tl.shape) + return (left_loss * wl + right_loss * wr).mean(-1, keepdim=True) + + +class ComputeLoss: 
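+    # Training loss for the YOLO head: BCE for classification, CIoU for box
+    # regression and Distribution Focal Loss for the box distribution, with
+    # positive anchors selected by the Assigner defined above.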
+ def __init__(self, model, params): + if hasattr(model, 'module'): + model = model.module + + device = next(model.parameters()).device + + m = model.head # Head() module + + self.params = params + self.stride = m.stride + self.nc = m.nc + self.no = m.no + self.reg_max = m.ch + self.device = device + + self.box_loss = BoxLoss(m.ch - 1).to(device) + self.cls_loss = torch.nn.BCEWithLogitsLoss(reduction='none') + self.assigner = Assigner(top_k=10, nc=self.nc, alpha=0.5, beta=6.0) + + self.project = torch.arange(m.ch, dtype=torch.float, device=device) + + def box_decode(self, anchor_points, pred_dist): + b, a, c = pred_dist.shape + pred_dist = pred_dist.view(b, a, 4, c // 4) + pred_dist = pred_dist.softmax(3) + pred_dist = pred_dist.matmul(self.project.type(pred_dist.dtype)) + lt, rb = pred_dist.chunk(2, -1) + x1y1 = anchor_points - lt + x2y2 = anchor_points + rb + return torch.cat((x1y1, x2y2), -1) + + def __call__(self, outputs, targets): + loss_cls = torch.zeros(1, device=self.device) + loss_box = torch.zeros(1, device=self.device) + loss_dfl = torch.zeros(1, device=self.device) + + x = torch.cat([i.view(outputs[0].shape[0], self.no, -1) for i in outputs], 2) + pred_distri, pred_scores = x.split((self.reg_max * 4, self.nc), 1) + + pred_scores = pred_scores.permute(0, 2, 1).contiguous() + pred_distri = pred_distri.permute(0, 2, 1).contiguous() + + data_type = pred_scores.dtype + batch_size = pred_scores.shape[0] + input_size = torch.tensor(outputs[0].shape[2:], device=self.device, dtype=data_type) * self.stride[0] + anchor_points, stride_tensor = make_anchors(outputs, self.stride, offset=0.5) + + idx = targets['idx'].view(-1, 1) + cls = targets['cls'].view(-1, 1) + box = targets['box'] + + targets = torch.cat((idx, cls, box), dim=1).to(self.device) + if targets.shape[0] == 0: + gt = torch.zeros(batch_size, 0, 5, device=self.device) + else: + i = targets[:, 0] + _, counts = i.unique(return_counts=True) + counts = counts.to(dtype=torch.int32) + gt = torch.zeros(batch_size, counts.max(), 5, device=self.device) + for j in range(batch_size): + matches = i == j + n = matches.sum() + if n: + gt[j, :n] = targets[matches, 1:] + x = gt[..., 1:5].mul_(input_size[[1, 0, 1, 0]]) + y = torch.empty_like(x) + dw = x[..., 2] / 2 # half-width + dh = x[..., 3] / 2 # half-height + y[..., 0] = x[..., 0] - dw # top left x + y[..., 1] = x[..., 1] - dh # top left y + y[..., 2] = x[..., 0] + dw # bottom right x + y[..., 3] = x[..., 1] + dh # bottom right y + gt[..., 1:5] = y + gt_labels, gt_bboxes = gt.split((1, 4), 2) + mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0) + + pred_bboxes = self.box_decode(anchor_points, pred_distri) + assigned_targets = self.assigner(pred_scores.detach().sigmoid(), + (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype), + anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt) + target_bboxes, target_scores, fg_mask = assigned_targets + + target_scores_sum = max(target_scores.sum(), 1) + + loss_cls = self.cls_loss(pred_scores, target_scores.to(data_type)).sum() / target_scores_sum # BCE + + # Box loss + if fg_mask.sum(): + target_bboxes /= stride_tensor + loss_box, loss_dfl = self.box_loss(pred_distri, + pred_bboxes, + anchor_points, + target_bboxes, + target_scores, + target_scores_sum, fg_mask) + + loss_box *= self.params['box'] # box gain + loss_cls *= self.params['cls'] # cls gain + loss_dfl *= self.params['dfl'] # dfl gain + + return loss_box, loss_cls, loss_dfl diff --git a/my_pretrained_method/YOLOv8_human/weights/best.pt 
b/my_pretrained_method/YOLOv8_human/weights/best.pt new file mode 100644 index 0000000000..360998461e Binary files /dev/null and b/my_pretrained_method/YOLOv8_human/weights/best.pt differ diff --git a/my_pretrained_method/YOLOv8_human/weights/last.pt b/my_pretrained_method/YOLOv8_human/weights/last.pt new file mode 100644 index 0000000000..0280f9c005 Binary files /dev/null and b/my_pretrained_method/YOLOv8_human/weights/last.pt differ diff --git a/my_pretrained_method/YOLOv8_human/weights/step.csv b/my_pretrained_method/YOLOv8_human/weights/step.csv new file mode 100644 index 0000000000..3687b3c88e --- /dev/null +++ b/my_pretrained_method/YOLOv8_human/weights/step.csv @@ -0,0 +1,601 @@ +epoch,box,dfl,cls,Recall,Precision,mAP@50,mAP +001,1.572,1.572,1.282,0.618,0.618,0.724,0.413 +002,1.502,1.502,1.050,0.621,0.621,0.732,0.413 +003,1.528,1.528,1.045,0.601,0.601,0.709,0.398 +004,1.532,1.532,1.036,0.617,0.617,0.725,0.407 +005,1.509,1.509,0.998,0.638,0.638,0.746,0.430 +006,1.491,1.491,0.973,0.623,0.623,0.743,0.429 +007,1.473,1.473,0.950,0.630,0.630,0.748,0.439 +008,1.456,1.456,0.934,0.630,0.630,0.750,0.438 +009,1.449,1.449,0.921,0.645,0.645,0.757,0.447 +010,1.438,1.438,0.912,0.655,0.655,0.770,0.454 +011,1.426,1.426,0.901,0.656,0.656,0.772,0.458 +012,1.421,1.421,0.893,0.662,0.662,0.776,0.461 +013,1.413,1.413,0.887,0.667,0.667,0.780,0.463 +014,1.410,1.410,0.878,0.666,0.666,0.782,0.467 +015,1.404,1.404,0.877,0.674,0.674,0.785,0.471 +016,1.396,1.396,0.868,0.666,0.666,0.784,0.473 +017,1.390,1.390,0.862,0.668,0.668,0.788,0.475 +018,1.392,1.392,0.862,0.678,0.678,0.789,0.475 +019,1.380,1.380,0.854,0.680,0.680,0.793,0.481 +020,1.377,1.377,0.852,0.680,0.680,0.793,0.479 +021,1.378,1.378,0.851,0.680,0.680,0.792,0.480 +022,1.371,1.371,0.844,0.682,0.682,0.794,0.482 +023,1.373,1.373,0.842,0.685,0.685,0.796,0.484 +024,1.366,1.366,0.836,0.686,0.686,0.798,0.487 +025,1.366,1.366,0.838,0.683,0.683,0.797,0.488 +026,1.360,1.360,0.834,0.687,0.687,0.801,0.490 +027,1.363,1.363,0.834,0.686,0.686,0.799,0.490 +028,1.359,1.359,0.833,0.691,0.691,0.803,0.491 +029,1.355,1.355,0.827,0.683,0.683,0.801,0.493 +030,1.354,1.354,0.826,0.689,0.689,0.804,0.494 +031,1.350,1.350,0.822,0.692,0.692,0.805,0.497 +032,1.351,1.351,0.822,0.697,0.697,0.808,0.496 +033,1.343,1.343,0.818,0.695,0.695,0.807,0.498 +034,1.341,1.341,0.817,0.694,0.694,0.808,0.498 +035,1.342,1.342,0.814,0.698,0.698,0.808,0.498 +036,1.342,1.342,0.814,0.701,0.701,0.809,0.501 +037,1.341,1.341,0.812,0.702,0.702,0.810,0.501 +038,1.344,1.344,0.812,0.702,0.702,0.812,0.503 +039,1.340,1.340,0.812,0.699,0.699,0.812,0.503 +040,1.338,1.338,0.810,0.702,0.702,0.813,0.504 +041,1.329,1.329,0.804,0.698,0.698,0.811,0.505 +042,1.331,1.331,0.803,0.704,0.704,0.813,0.505 +043,1.329,1.329,0.800,0.705,0.705,0.814,0.506 +044,1.329,1.329,0.801,0.704,0.704,0.814,0.507 +045,1.332,1.332,0.804,0.706,0.706,0.815,0.509 +046,1.327,1.327,0.800,0.702,0.702,0.815,0.509 +047,1.326,1.326,0.799,0.709,0.709,0.817,0.510 +048,1.323,1.323,0.796,0.705,0.705,0.816,0.511 +049,1.328,1.328,0.798,0.707,0.707,0.818,0.511 +050,1.319,1.319,0.793,0.709,0.709,0.818,0.511 +051,1.325,1.325,0.794,0.710,0.710,0.819,0.513 +052,1.322,1.322,0.791,0.712,0.712,0.820,0.513 +053,1.320,1.320,0.791,0.712,0.712,0.819,0.513 +054,1.323,1.323,0.790,0.714,0.714,0.820,0.514 +055,1.314,1.314,0.787,0.712,0.712,0.821,0.514 +056,1.319,1.319,0.789,0.711,0.711,0.821,0.515 +057,1.319,1.319,0.790,0.713,0.713,0.822,0.516 +058,1.312,1.312,0.786,0.712,0.712,0.821,0.516 +059,1.310,1.310,0.785,0.711,0.711,0.822,0.516 
+060,1.311,1.311,0.784,0.712,0.712,0.822,0.517 +061,1.312,1.312,0.785,0.714,0.714,0.823,0.517 +062,1.311,1.311,0.784,0.714,0.714,0.822,0.517 +063,1.309,1.309,0.783,0.715,0.715,0.823,0.517 +064,1.310,1.310,0.783,0.716,0.716,0.824,0.518 +065,1.306,1.306,0.781,0.716,0.716,0.825,0.519 +066,1.310,1.310,0.781,0.716,0.716,0.824,0.519 +067,1.308,1.308,0.782,0.716,0.716,0.824,0.519 +068,1.310,1.310,0.783,0.716,0.716,0.824,0.519 +069,1.305,1.305,0.779,0.717,0.717,0.825,0.520 +070,1.304,1.304,0.776,0.716,0.716,0.824,0.520 +071,1.298,1.298,0.774,0.718,0.718,0.826,0.521 +072,1.300,1.300,0.773,0.718,0.718,0.826,0.521 +073,1.307,1.307,0.777,0.720,0.720,0.826,0.521 +074,1.301,1.301,0.774,0.719,0.719,0.827,0.521 +075,1.300,1.300,0.772,0.719,0.719,0.827,0.522 +076,1.302,1.302,0.772,0.718,0.718,0.827,0.522 +077,1.302,1.302,0.772,0.719,0.719,0.827,0.522 +078,1.305,1.305,0.777,0.719,0.719,0.827,0.522 +079,1.299,1.299,0.771,0.719,0.719,0.827,0.523 +080,1.301,1.301,0.772,0.720,0.720,0.827,0.522 +081,1.300,1.300,0.770,0.720,0.720,0.827,0.523 +082,1.296,1.296,0.770,0.719,0.719,0.827,0.523 +083,1.295,1.295,0.769,0.718,0.718,0.827,0.523 +084,1.298,1.298,0.771,0.720,0.720,0.828,0.523 +085,1.295,1.295,0.767,0.720,0.720,0.828,0.523 +086,1.298,1.298,0.770,0.721,0.721,0.828,0.524 +087,1.293,1.293,0.768,0.721,0.721,0.828,0.524 +088,1.299,1.299,0.768,0.720,0.720,0.828,0.524 +089,1.299,1.299,0.769,0.720,0.720,0.828,0.524 +090,1.294,1.294,0.766,0.721,0.721,0.829,0.524 +091,1.293,1.293,0.766,0.721,0.721,0.829,0.524 +092,1.293,1.293,0.765,0.722,0.722,0.829,0.525 +093,1.290,1.290,0.763,0.721,0.721,0.829,0.524 +094,1.294,1.294,0.767,0.722,0.722,0.829,0.525 +095,1.289,1.289,0.764,0.722,0.722,0.829,0.525 +096,1.285,1.285,0.762,0.722,0.722,0.829,0.525 +097,1.288,1.288,0.763,0.722,0.722,0.829,0.525 +098,1.289,1.289,0.761,0.722,0.722,0.829,0.525 +099,1.286,1.286,0.760,0.723,0.723,0.829,0.525 +100,1.288,1.288,0.762,0.722,0.722,0.829,0.525 +101,1.285,1.285,0.761,0.722,0.722,0.830,0.525 +102,1.288,1.288,0.761,0.723,0.723,0.830,0.526 +103,1.286,1.286,0.760,0.723,0.723,0.830,0.526 +104,1.286,1.286,0.761,0.723,0.723,0.830,0.526 +105,1.284,1.284,0.755,0.724,0.724,0.830,0.526 +106,1.289,1.289,0.761,0.724,0.724,0.830,0.526 +107,1.284,1.284,0.757,0.724,0.724,0.830,0.526 +108,1.284,1.284,0.758,0.723,0.723,0.830,0.526 +109,1.286,1.286,0.759,0.724,0.724,0.830,0.526 +110,1.282,1.282,0.758,0.724,0.724,0.830,0.526 +111,1.277,1.277,0.754,0.724,0.724,0.830,0.526 +112,1.281,1.281,0.753,0.724,0.724,0.830,0.526 +113,1.289,1.289,0.758,0.724,0.724,0.831,0.526 +114,1.282,1.282,0.755,0.724,0.724,0.831,0.527 +115,1.277,1.277,0.754,0.724,0.724,0.831,0.527 +116,1.277,1.277,0.752,0.724,0.724,0.831,0.527 +117,1.284,1.284,0.754,0.723,0.723,0.831,0.527 +118,1.278,1.278,0.753,0.723,0.723,0.831,0.527 +119,1.278,1.278,0.752,0.724,0.724,0.831,0.527 +120,1.275,1.275,0.749,0.724,0.724,0.831,0.527 +121,1.274,1.274,0.751,0.724,0.724,0.831,0.527 +122,1.279,1.279,0.752,0.724,0.724,0.831,0.527 +123,1.277,1.277,0.751,0.724,0.724,0.831,0.527 +124,1.280,1.280,0.750,0.725,0.725,0.831,0.527 +125,1.278,1.278,0.752,0.725,0.725,0.831,0.527 +126,1.279,1.279,0.751,0.725,0.725,0.831,0.527 +127,1.276,1.276,0.752,0.725,0.725,0.831,0.528 +128,1.269,1.269,0.749,0.724,0.724,0.831,0.528 +129,1.275,1.275,0.748,0.725,0.725,0.831,0.528 +130,1.278,1.278,0.753,0.725,0.725,0.831,0.528 +131,1.272,1.272,0.748,0.725,0.725,0.831,0.528 +132,1.272,1.272,0.743,0.724,0.724,0.831,0.528 +133,1.274,1.274,0.750,0.723,0.723,0.831,0.528 +134,1.274,1.274,0.748,0.723,0.723,0.831,0.528 
+135,1.269,1.269,0.745,0.723,0.723,0.831,0.528 +136,1.272,1.272,0.745,0.723,0.723,0.831,0.528 +137,1.275,1.275,0.747,0.723,0.723,0.831,0.528 +138,1.272,1.272,0.744,0.723,0.723,0.831,0.528 +139,1.269,1.269,0.746,0.723,0.723,0.831,0.528 +140,1.269,1.269,0.742,0.723,0.723,0.831,0.528 +141,1.268,1.268,0.743,0.722,0.722,0.832,0.528 +142,1.272,1.272,0.746,0.723,0.723,0.831,0.528 +143,1.273,1.273,0.747,0.723,0.723,0.832,0.528 +144,1.268,1.268,0.743,0.723,0.723,0.831,0.528 +145,1.271,1.271,0.744,0.723,0.723,0.832,0.528 +146,1.267,1.267,0.744,0.723,0.723,0.832,0.528 +147,1.265,1.265,0.743,0.724,0.724,0.832,0.528 +148,1.266,1.266,0.744,0.724,0.724,0.832,0.528 +149,1.267,1.267,0.744,0.724,0.724,0.832,0.528 +150,1.266,1.266,0.741,0.724,0.724,0.832,0.528 +151,1.269,1.269,0.741,0.724,0.724,0.832,0.528 +152,1.274,1.274,0.745,0.724,0.724,0.832,0.528 +153,1.265,1.265,0.743,0.724,0.724,0.832,0.528 +154,1.269,1.269,0.743,0.724,0.724,0.832,0.528 +155,1.266,1.266,0.741,0.724,0.724,0.832,0.528 +156,1.264,1.264,0.741,0.724,0.724,0.832,0.528 +157,1.266,1.266,0.740,0.724,0.724,0.832,0.528 +158,1.268,1.268,0.743,0.724,0.724,0.832,0.528 +159,1.265,1.265,0.740,0.724,0.724,0.832,0.529 +160,1.266,1.266,0.745,0.724,0.724,0.832,0.529 +161,1.264,1.264,0.741,0.724,0.724,0.832,0.529 +162,1.266,1.266,0.739,0.724,0.724,0.832,0.529 +163,1.264,1.264,0.740,0.724,0.724,0.832,0.529 +164,1.262,1.262,0.740,0.724,0.724,0.832,0.529 +165,1.268,1.268,0.740,0.724,0.724,0.832,0.529 +166,1.269,1.269,0.741,0.725,0.725,0.832,0.529 +167,1.263,1.263,0.741,0.724,0.724,0.832,0.529 +168,1.265,1.265,0.740,0.724,0.724,0.832,0.529 +169,1.257,1.257,0.733,0.724,0.724,0.832,0.529 +170,1.266,1.266,0.741,0.724,0.724,0.832,0.529 +171,1.262,1.262,0.736,0.724,0.724,0.832,0.529 +172,1.259,1.259,0.736,0.725,0.725,0.832,0.529 +173,1.259,1.259,0.736,0.724,0.724,0.832,0.529 +174,1.257,1.257,0.739,0.725,0.725,0.832,0.529 +175,1.259,1.259,0.733,0.725,0.725,0.832,0.529 +176,1.258,1.258,0.736,0.726,0.726,0.832,0.529 +177,1.264,1.264,0.739,0.725,0.725,0.832,0.529 +178,1.258,1.258,0.735,0.725,0.725,0.832,0.529 +179,1.256,1.256,0.731,0.726,0.726,0.832,0.530 +180,1.257,1.257,0.733,0.725,0.725,0.832,0.530 +181,1.257,1.257,0.732,0.726,0.726,0.833,0.530 +182,1.257,1.257,0.734,0.726,0.726,0.833,0.530 +183,1.255,1.255,0.732,0.726,0.726,0.833,0.530 +184,1.260,1.260,0.737,0.726,0.726,0.833,0.530 +185,1.257,1.257,0.734,0.726,0.726,0.833,0.530 +186,1.253,1.253,0.734,0.726,0.726,0.833,0.530 +187,1.252,1.252,0.731,0.726,0.726,0.833,0.530 +188,1.254,1.254,0.730,0.726,0.726,0.833,0.530 +189,1.260,1.260,0.735,0.726,0.726,0.833,0.530 +190,1.258,1.258,0.732,0.726,0.726,0.833,0.530 +191,1.247,1.247,0.728,0.727,0.727,0.833,0.530 +192,1.255,1.255,0.733,0.727,0.727,0.833,0.530 +193,1.257,1.257,0.732,0.727,0.727,0.833,0.530 +194,1.257,1.257,0.729,0.727,0.727,0.833,0.530 +195,1.252,1.252,0.731,0.727,0.727,0.833,0.530 +196,1.250,1.250,0.730,0.727,0.727,0.833,0.530 +197,1.254,1.254,0.732,0.727,0.727,0.833,0.530 +198,1.256,1.256,0.730,0.727,0.727,0.833,0.530 +199,1.254,1.254,0.730,0.727,0.727,0.833,0.530 +200,1.256,1.256,0.731,0.727,0.727,0.833,0.530 +201,1.251,1.251,0.728,0.727,0.727,0.833,0.531 +202,1.256,1.256,0.730,0.726,0.726,0.833,0.531 +203,1.249,1.249,0.727,0.726,0.726,0.833,0.531 +204,1.250,1.250,0.726,0.726,0.726,0.833,0.531 +205,1.251,1.251,0.728,0.726,0.726,0.833,0.531 +206,1.253,1.253,0.729,0.726,0.726,0.833,0.531 +207,1.252,1.252,0.730,0.727,0.727,0.833,0.531 +208,1.249,1.249,0.727,0.726,0.726,0.833,0.531 +209,1.250,1.250,0.729,0.726,0.726,0.833,0.531 
+210,1.248,1.248,0.722,0.728,0.728,0.833,0.531 +211,1.252,1.252,0.727,0.727,0.727,0.833,0.531 +212,1.249,1.249,0.726,0.728,0.728,0.833,0.531 +213,1.246,1.246,0.726,0.728,0.728,0.833,0.531 +214,1.248,1.248,0.725,0.728,0.728,0.834,0.531 +215,1.245,1.245,0.724,0.728,0.728,0.834,0.531 +216,1.245,1.245,0.722,0.728,0.728,0.834,0.531 +217,1.248,1.248,0.723,0.728,0.728,0.834,0.531 +218,1.248,1.248,0.723,0.728,0.728,0.834,0.531 +219,1.245,1.245,0.723,0.728,0.728,0.834,0.531 +220,1.247,1.247,0.724,0.728,0.728,0.834,0.531 +221,1.244,1.244,0.722,0.728,0.728,0.834,0.531 +222,1.247,1.247,0.722,0.728,0.728,0.834,0.531 +223,1.248,1.248,0.724,0.728,0.728,0.834,0.531 +224,1.247,1.247,0.725,0.728,0.728,0.834,0.531 +225,1.246,1.246,0.722,0.728,0.728,0.834,0.531 +226,1.245,1.245,0.722,0.729,0.729,0.834,0.531 +227,1.244,1.244,0.724,0.728,0.728,0.834,0.531 +228,1.244,1.244,0.722,0.729,0.729,0.834,0.531 +229,1.242,1.242,0.720,0.729,0.729,0.834,0.531 +230,1.243,1.243,0.721,0.729,0.729,0.834,0.531 +231,1.245,1.245,0.721,0.729,0.729,0.834,0.531 +232,1.242,1.242,0.721,0.730,0.730,0.834,0.531 +233,1.246,1.246,0.722,0.730,0.730,0.834,0.532 +234,1.245,1.245,0.722,0.730,0.730,0.834,0.532 +235,1.240,1.240,0.718,0.730,0.730,0.834,0.532 +236,1.243,1.243,0.720,0.729,0.729,0.834,0.532 +237,1.239,1.239,0.718,0.730,0.730,0.834,0.532 +238,1.240,1.240,0.717,0.729,0.729,0.834,0.532 +239,1.241,1.241,0.718,0.730,0.730,0.834,0.532 +240,1.244,1.244,0.720,0.730,0.730,0.834,0.532 +241,1.241,1.241,0.718,0.730,0.730,0.834,0.532 +242,1.240,1.240,0.719,0.731,0.731,0.834,0.532 +243,1.241,1.241,0.720,0.731,0.731,0.834,0.532 +244,1.243,1.243,0.721,0.730,0.730,0.834,0.532 +245,1.239,1.239,0.719,0.731,0.731,0.834,0.532 +246,1.237,1.237,0.717,0.730,0.730,0.834,0.532 +247,1.237,1.237,0.719,0.730,0.730,0.834,0.532 +248,1.240,1.240,0.723,0.730,0.730,0.834,0.532 +249,1.238,1.238,0.715,0.730,0.730,0.834,0.533 +250,1.239,1.239,0.719,0.730,0.730,0.834,0.532 +251,1.238,1.238,0.718,0.730,0.730,0.835,0.533 +252,1.237,1.237,0.715,0.729,0.729,0.834,0.533 +253,1.238,1.238,0.715,0.730,0.730,0.834,0.533 +254,1.237,1.237,0.717,0.729,0.729,0.835,0.533 +255,1.235,1.235,0.714,0.729,0.729,0.835,0.533 +256,1.239,1.239,0.717,0.729,0.729,0.835,0.533 +257,1.238,1.238,0.719,0.730,0.730,0.835,0.533 +258,1.236,1.236,0.714,0.730,0.730,0.835,0.533 +259,1.237,1.237,0.714,0.730,0.730,0.835,0.533 +260,1.237,1.237,0.714,0.730,0.730,0.835,0.533 +261,1.237,1.237,0.713,0.730,0.730,0.835,0.533 +262,1.233,1.233,0.714,0.729,0.729,0.835,0.533 +263,1.234,1.234,0.712,0.730,0.730,0.835,0.533 +264,1.237,1.237,0.716,0.730,0.730,0.835,0.533 +265,1.236,1.236,0.715,0.730,0.730,0.835,0.533 +266,1.235,1.235,0.713,0.730,0.730,0.835,0.533 +267,1.233,1.233,0.713,0.730,0.730,0.835,0.533 +268,1.238,1.238,0.714,0.730,0.730,0.835,0.533 +269,1.234,1.234,0.713,0.730,0.730,0.835,0.533 +270,1.233,1.233,0.715,0.730,0.730,0.835,0.533 +271,1.235,1.235,0.714,0.730,0.730,0.835,0.533 +272,1.233,1.233,0.710,0.730,0.730,0.835,0.533 +273,1.234,1.234,0.713,0.730,0.730,0.835,0.533 +274,1.235,1.235,0.717,0.730,0.730,0.835,0.533 +275,1.235,1.235,0.715,0.730,0.730,0.835,0.533 +276,1.235,1.235,0.712,0.730,0.730,0.836,0.533 +277,1.231,1.231,0.711,0.731,0.731,0.836,0.533 +278,1.233,1.233,0.711,0.730,0.730,0.836,0.533 +279,1.232,1.232,0.711,0.730,0.730,0.836,0.533 +280,1.232,1.232,0.712,0.730,0.730,0.835,0.533 +281,1.228,1.228,0.710,0.730,0.730,0.836,0.533 +282,1.228,1.228,0.710,0.730,0.730,0.836,0.534 +283,1.229,1.229,0.710,0.730,0.730,0.836,0.534 +284,1.231,1.231,0.710,0.730,0.730,0.836,0.534 
+285,1.227,1.227,0.708,0.731,0.731,0.836,0.534 +286,1.227,1.227,0.707,0.731,0.731,0.836,0.534 +287,1.229,1.229,0.707,0.730,0.730,0.836,0.534 +288,1.228,1.228,0.709,0.730,0.730,0.836,0.534 +289,1.226,1.226,0.708,0.731,0.731,0.836,0.534 +290,1.227,1.227,0.707,0.731,0.731,0.836,0.534 +291,1.227,1.227,0.707,0.731,0.731,0.836,0.534 +292,1.227,1.227,0.709,0.731,0.731,0.836,0.534 +293,1.233,1.233,0.711,0.731,0.731,0.836,0.534 +294,1.227,1.227,0.709,0.731,0.731,0.836,0.534 +295,1.232,1.232,0.710,0.731,0.731,0.836,0.534 +296,1.222,1.222,0.705,0.731,0.731,0.836,0.534 +297,1.228,1.228,0.710,0.732,0.732,0.836,0.534 +298,1.226,1.226,0.708,0.731,0.731,0.836,0.534 +299,1.227,1.227,0.708,0.732,0.732,0.836,0.534 +300,1.224,1.224,0.706,0.732,0.732,0.836,0.534 +301,1.225,1.225,0.704,0.732,0.732,0.836,0.534 +302,1.225,1.225,0.709,0.732,0.732,0.836,0.534 +303,1.226,1.226,0.706,0.732,0.732,0.836,0.534 +304,1.227,1.227,0.705,0.732,0.732,0.836,0.534 +305,1.230,1.230,0.708,0.732,0.732,0.836,0.534 +306,1.227,1.227,0.706,0.732,0.732,0.836,0.534 +307,1.225,1.225,0.706,0.732,0.732,0.836,0.534 +308,1.222,1.222,0.706,0.732,0.732,0.836,0.534 +309,1.225,1.225,0.704,0.733,0.733,0.836,0.534 +310,1.227,1.227,0.706,0.732,0.732,0.836,0.534 +311,1.224,1.224,0.706,0.733,0.733,0.836,0.534 +312,1.229,1.229,0.708,0.732,0.732,0.836,0.534 +313,1.223,1.223,0.704,0.733,0.733,0.837,0.534 +314,1.224,1.224,0.706,0.732,0.732,0.837,0.535 +315,1.222,1.222,0.705,0.732,0.732,0.836,0.534 +316,1.222,1.222,0.706,0.732,0.732,0.836,0.535 +317,1.224,1.224,0.704,0.732,0.732,0.837,0.535 +318,1.223,1.223,0.703,0.732,0.732,0.836,0.535 +319,1.222,1.222,0.702,0.733,0.733,0.836,0.534 +320,1.224,1.224,0.706,0.732,0.732,0.836,0.535 +321,1.221,1.221,0.704,0.732,0.732,0.836,0.534 +322,1.217,1.217,0.699,0.733,0.733,0.836,0.534 +323,1.219,1.219,0.701,0.732,0.732,0.836,0.535 +324,1.221,1.221,0.701,0.732,0.732,0.836,0.535 +325,1.219,1.219,0.702,0.733,0.733,0.837,0.535 +326,1.223,1.223,0.704,0.732,0.732,0.837,0.535 +327,1.222,1.222,0.700,0.732,0.732,0.836,0.534 +328,1.219,1.219,0.702,0.732,0.732,0.836,0.534 +329,1.217,1.217,0.700,0.732,0.732,0.837,0.535 +330,1.214,1.214,0.700,0.733,0.733,0.836,0.534 +331,1.213,1.213,0.699,0.732,0.732,0.837,0.535 +332,1.220,1.220,0.702,0.732,0.732,0.836,0.535 +333,1.216,1.216,0.698,0.732,0.732,0.837,0.535 +334,1.215,1.215,0.698,0.733,0.733,0.837,0.535 +335,1.216,1.216,0.699,0.733,0.733,0.836,0.535 +336,1.218,1.218,0.700,0.733,0.733,0.836,0.535 +337,1.218,1.218,0.700,0.733,0.733,0.837,0.535 +338,1.213,1.213,0.697,0.733,0.733,0.837,0.535 +339,1.218,1.218,0.698,0.733,0.733,0.837,0.535 +340,1.215,1.215,0.698,0.732,0.732,0.837,0.535 +341,1.219,1.219,0.699,0.733,0.733,0.837,0.535 +342,1.216,1.216,0.698,0.733,0.733,0.837,0.535 +343,1.218,1.218,0.699,0.733,0.733,0.837,0.535 +344,1.215,1.215,0.697,0.732,0.732,0.837,0.535 +345,1.213,1.213,0.695,0.733,0.733,0.837,0.535 +346,1.215,1.215,0.696,0.733,0.733,0.837,0.535 +347,1.213,1.213,0.695,0.733,0.733,0.837,0.535 +348,1.217,1.217,0.698,0.733,0.733,0.837,0.535 +349,1.215,1.215,0.697,0.733,0.733,0.837,0.535 +350,1.214,1.214,0.698,0.733,0.733,0.837,0.535 +351,1.211,1.211,0.696,0.733,0.733,0.837,0.535 +352,1.211,1.211,0.696,0.732,0.732,0.837,0.535 +353,1.208,1.208,0.696,0.733,0.733,0.837,0.535 +354,1.210,1.210,0.695,0.733,0.733,0.837,0.535 +355,1.213,1.213,0.696,0.733,0.733,0.837,0.535 +356,1.212,1.212,0.694,0.733,0.733,0.837,0.535 +357,1.213,1.213,0.698,0.733,0.733,0.837,0.535 +358,1.215,1.215,0.695,0.732,0.732,0.837,0.535 +359,1.204,1.204,0.691,0.732,0.732,0.837,0.535 
+360,1.214,1.214,0.695,0.732,0.732,0.837,0.535 +361,1.207,1.207,0.691,0.732,0.732,0.837,0.535 +362,1.206,1.206,0.693,0.732,0.732,0.837,0.535 +363,1.208,1.208,0.692,0.731,0.731,0.837,0.535 +364,1.209,1.209,0.692,0.731,0.731,0.837,0.535 +365,1.205,1.205,0.693,0.731,0.731,0.837,0.535 +366,1.210,1.210,0.695,0.731,0.731,0.837,0.535 +367,1.207,1.207,0.692,0.732,0.732,0.837,0.535 +368,1.207,1.207,0.693,0.732,0.732,0.837,0.535 +369,1.202,1.202,0.686,0.731,0.731,0.837,0.535 +370,1.209,1.209,0.691,0.731,0.731,0.837,0.535 +371,1.212,1.212,0.695,0.731,0.731,0.837,0.535 +372,1.204,1.204,0.689,0.731,0.731,0.837,0.535 +373,1.209,1.209,0.693,0.731,0.731,0.837,0.535 +374,1.207,1.207,0.692,0.731,0.731,0.837,0.535 +375,1.206,1.206,0.691,0.731,0.731,0.837,0.535 +376,1.206,1.206,0.688,0.732,0.732,0.837,0.535 +377,1.208,1.208,0.688,0.732,0.732,0.837,0.535 +378,1.203,1.203,0.689,0.731,0.731,0.837,0.535 +379,1.206,1.206,0.689,0.731,0.731,0.837,0.535 +380,1.206,1.206,0.688,0.731,0.731,0.837,0.535 +381,1.204,1.204,0.688,0.731,0.731,0.837,0.535 +382,1.205,1.205,0.689,0.732,0.732,0.837,0.535 +383,1.201,1.201,0.688,0.731,0.731,0.837,0.535 +384,1.210,1.210,0.692,0.731,0.731,0.837,0.535 +385,1.203,1.203,0.689,0.732,0.732,0.837,0.535 +386,1.204,1.204,0.690,0.732,0.732,0.837,0.535 +387,1.201,1.201,0.688,0.732,0.732,0.837,0.535 +388,1.199,1.199,0.687,0.731,0.731,0.837,0.535 +389,1.200,1.200,0.687,0.731,0.731,0.837,0.535 +390,1.202,1.202,0.687,0.732,0.732,0.837,0.535 +391,1.200,1.200,0.684,0.732,0.732,0.837,0.535 +392,1.206,1.206,0.688,0.732,0.732,0.837,0.535 +393,1.197,1.197,0.684,0.732,0.732,0.838,0.535 +394,1.201,1.201,0.688,0.732,0.732,0.838,0.535 +395,1.200,1.200,0.685,0.732,0.732,0.838,0.535 +396,1.200,1.200,0.685,0.733,0.733,0.838,0.535 +397,1.199,1.199,0.685,0.732,0.732,0.838,0.535 +398,1.197,1.197,0.686,0.733,0.733,0.838,0.535 +399,1.202,1.202,0.689,0.732,0.732,0.838,0.535 +400,1.196,1.196,0.684,0.732,0.732,0.838,0.535 +401,1.200,1.200,0.685,0.733,0.733,0.838,0.535 +402,1.199,1.199,0.686,0.733,0.733,0.838,0.535 +403,1.201,1.201,0.684,0.733,0.733,0.838,0.535 +404,1.195,1.195,0.681,0.733,0.733,0.838,0.535 +405,1.191,1.191,0.680,0.733,0.733,0.838,0.535 +406,1.200,1.200,0.685,0.732,0.732,0.838,0.535 +407,1.197,1.197,0.684,0.732,0.732,0.838,0.535 +408,1.195,1.195,0.680,0.732,0.732,0.838,0.535 +409,1.196,1.196,0.681,0.732,0.732,0.838,0.535 +410,1.195,1.195,0.681,0.732,0.732,0.838,0.535 +411,1.199,1.199,0.682,0.732,0.732,0.838,0.535 +412,1.197,1.197,0.684,0.732,0.732,0.838,0.535 +413,1.194,1.194,0.683,0.732,0.732,0.838,0.535 +414,1.192,1.192,0.680,0.732,0.732,0.838,0.535 +415,1.193,1.193,0.681,0.732,0.732,0.838,0.535 +416,1.194,1.194,0.680,0.732,0.732,0.838,0.535 +417,1.194,1.194,0.679,0.732,0.732,0.838,0.535 +418,1.196,1.196,0.682,0.733,0.733,0.838,0.535 +419,1.194,1.194,0.680,0.733,0.733,0.838,0.535 +420,1.196,1.196,0.681,0.732,0.732,0.838,0.535 +421,1.193,1.193,0.680,0.733,0.733,0.838,0.535 +422,1.192,1.192,0.678,0.733,0.733,0.838,0.535 +423,1.193,1.193,0.681,0.732,0.732,0.838,0.535 +424,1.187,1.187,0.679,0.732,0.732,0.838,0.535 +425,1.188,1.188,0.678,0.732,0.732,0.838,0.535 +426,1.187,1.187,0.679,0.732,0.732,0.838,0.535 +427,1.191,1.191,0.680,0.732,0.732,0.838,0.535 +428,1.190,1.190,0.678,0.733,0.733,0.838,0.535 +429,1.188,1.188,0.678,0.733,0.733,0.838,0.535 +430,1.188,1.188,0.677,0.733,0.733,0.838,0.535 +431,1.185,1.185,0.676,0.732,0.732,0.838,0.535 +432,1.183,1.183,0.675,0.732,0.732,0.838,0.535 +433,1.188,1.188,0.678,0.732,0.732,0.838,0.535 +434,1.191,1.191,0.680,0.732,0.732,0.838,0.535 
+435,1.189,1.189,0.675,0.732,0.732,0.838,0.535 +436,1.186,1.186,0.675,0.732,0.732,0.838,0.535 +437,1.187,1.187,0.676,0.733,0.733,0.838,0.535 +438,1.190,1.190,0.676,0.733,0.733,0.838,0.535 +439,1.187,1.187,0.675,0.733,0.733,0.838,0.535 +440,1.188,1.188,0.676,0.733,0.733,0.838,0.535 +441,1.183,1.183,0.673,0.733,0.733,0.838,0.535 +442,1.185,1.185,0.673,0.733,0.733,0.838,0.535 +443,1.187,1.187,0.675,0.733,0.733,0.839,0.535 +444,1.188,1.188,0.676,0.734,0.734,0.839,0.535 +445,1.182,1.182,0.675,0.733,0.733,0.838,0.535 +446,1.188,1.188,0.674,0.733,0.733,0.839,0.535 +447,1.181,1.181,0.671,0.733,0.733,0.838,0.535 +448,1.184,1.184,0.673,0.733,0.733,0.838,0.535 +449,1.180,1.180,0.670,0.733,0.733,0.838,0.535 +450,1.182,1.182,0.672,0.733,0.733,0.838,0.535 +451,1.181,1.181,0.671,0.733,0.733,0.838,0.535 +452,1.181,1.181,0.671,0.733,0.733,0.839,0.535 +453,1.181,1.181,0.672,0.732,0.732,0.839,0.535 +454,1.180,1.180,0.671,0.732,0.732,0.838,0.535 +455,1.185,1.185,0.673,0.732,0.732,0.839,0.535 +456,1.182,1.182,0.671,0.732,0.732,0.838,0.535 +457,1.181,1.181,0.671,0.732,0.732,0.839,0.535 +458,1.176,1.176,0.667,0.733,0.733,0.839,0.535 +459,1.179,1.179,0.670,0.733,0.733,0.838,0.535 +460,1.180,1.180,0.671,0.733,0.733,0.838,0.535 +461,1.175,1.175,0.668,0.733,0.733,0.838,0.535 +462,1.178,1.178,0.667,0.733,0.733,0.838,0.535 +463,1.181,1.181,0.670,0.733,0.733,0.838,0.535 +464,1.178,1.178,0.669,0.733,0.733,0.838,0.535 +465,1.176,1.176,0.666,0.733,0.733,0.838,0.535 +466,1.178,1.178,0.667,0.733,0.733,0.838,0.535 +467,1.179,1.179,0.667,0.733,0.733,0.838,0.535 +468,1.175,1.175,0.667,0.733,0.733,0.838,0.535 +469,1.178,1.178,0.669,0.733,0.733,0.838,0.535 +470,1.177,1.177,0.667,0.733,0.733,0.838,0.535 +471,1.170,1.170,0.663,0.733,0.733,0.838,0.535 +472,1.175,1.175,0.665,0.733,0.733,0.838,0.535 +473,1.171,1.171,0.666,0.733,0.733,0.838,0.535 +474,1.174,1.174,0.665,0.733,0.733,0.838,0.535 +475,1.178,1.178,0.667,0.733,0.733,0.838,0.535 +476,1.176,1.176,0.665,0.732,0.732,0.838,0.535 +477,1.172,1.172,0.665,0.733,0.733,0.838,0.535 +478,1.174,1.174,0.666,0.732,0.732,0.838,0.535 +479,1.174,1.174,0.665,0.732,0.732,0.838,0.535 +480,1.174,1.174,0.665,0.731,0.731,0.838,0.535 +481,1.174,1.174,0.666,0.732,0.732,0.838,0.535 +482,1.168,1.168,0.663,0.732,0.732,0.838,0.535 +483,1.172,1.172,0.664,0.732,0.732,0.838,0.534 +484,1.173,1.173,0.663,0.732,0.732,0.838,0.534 +485,1.167,1.167,0.662,0.732,0.732,0.838,0.534 +486,1.173,1.173,0.666,0.732,0.732,0.838,0.534 +487,1.170,1.170,0.661,0.731,0.731,0.838,0.534 +488,1.169,1.169,0.662,0.731,0.731,0.838,0.534 +489,1.172,1.172,0.663,0.732,0.732,0.838,0.534 +490,1.170,1.170,0.662,0.732,0.732,0.838,0.534 +491,1.169,1.169,0.661,0.731,0.731,0.838,0.534 +492,1.167,1.167,0.660,0.731,0.731,0.838,0.534 +493,1.170,1.170,0.663,0.732,0.732,0.838,0.534 +494,1.171,1.171,0.662,0.731,0.731,0.838,0.534 +495,1.174,1.174,0.664,0.732,0.732,0.838,0.534 +496,1.169,1.169,0.661,0.732,0.732,0.838,0.534 +497,1.162,1.162,0.658,0.732,0.732,0.838,0.534 +498,1.167,1.167,0.660,0.732,0.732,0.838,0.534 +499,1.167,1.167,0.659,0.732,0.732,0.838,0.534 +500,1.169,1.169,0.660,0.732,0.732,0.838,0.534 +501,1.163,1.163,0.658,0.732,0.732,0.838,0.534 +502,1.164,1.164,0.659,0.732,0.732,0.838,0.534 +503,1.166,1.166,0.658,0.732,0.732,0.838,0.534 +504,1.166,1.166,0.659,0.732,0.732,0.838,0.534 +505,1.156,1.156,0.654,0.731,0.731,0.838,0.534 +506,1.165,1.165,0.658,0.732,0.732,0.838,0.534 +507,1.163,1.163,0.658,0.732,0.732,0.838,0.534 +508,1.162,1.162,0.656,0.731,0.731,0.838,0.534 +509,1.161,1.161,0.655,0.731,0.731,0.838,0.534 
+510,1.161,1.161,0.655,0.731,0.731,0.838,0.534 +511,1.156,1.156,0.652,0.731,0.731,0.838,0.534 +512,1.156,1.156,0.653,0.731,0.731,0.838,0.534 +513,1.160,1.160,0.656,0.731,0.731,0.838,0.534 +514,1.158,1.158,0.653,0.731,0.731,0.838,0.534 +515,1.158,1.158,0.654,0.731,0.731,0.838,0.534 +516,1.154,1.154,0.653,0.731,0.731,0.838,0.534 +517,1.159,1.159,0.651,0.731,0.731,0.838,0.534 +518,1.157,1.157,0.654,0.731,0.731,0.838,0.534 +519,1.153,1.153,0.649,0.732,0.732,0.838,0.534 +520,1.158,1.158,0.652,0.732,0.732,0.838,0.534 +521,1.160,1.160,0.654,0.731,0.731,0.838,0.534 +522,1.157,1.157,0.652,0.731,0.731,0.838,0.534 +523,1.157,1.157,0.653,0.732,0.732,0.838,0.534 +524,1.154,1.154,0.650,0.731,0.731,0.838,0.534 +525,1.152,1.152,0.647,0.731,0.731,0.838,0.534 +526,1.154,1.154,0.651,0.731,0.731,0.838,0.534 +527,1.156,1.156,0.651,0.732,0.732,0.838,0.534 +528,1.157,1.157,0.652,0.731,0.731,0.838,0.534 +529,1.157,1.157,0.651,0.732,0.732,0.838,0.534 +530,1.151,1.151,0.650,0.732,0.732,0.838,0.534 +531,1.156,1.156,0.652,0.732,0.732,0.838,0.534 +532,1.150,1.150,0.646,0.732,0.732,0.838,0.534 +533,1.154,1.154,0.649,0.731,0.731,0.838,0.534 +534,1.149,1.149,0.646,0.731,0.731,0.838,0.534 +535,1.151,1.151,0.646,0.731,0.731,0.838,0.534 +536,1.148,1.148,0.646,0.731,0.731,0.838,0.534 +537,1.148,1.148,0.646,0.731,0.731,0.838,0.534 +538,1.148,1.148,0.646,0.731,0.731,0.838,0.534 +539,1.151,1.151,0.648,0.731,0.731,0.838,0.533 +540,1.146,1.146,0.647,0.731,0.731,0.838,0.534 +541,1.150,1.150,0.646,0.731,0.731,0.838,0.534 +542,1.146,1.146,0.645,0.731,0.731,0.838,0.533 +543,1.145,1.145,0.646,0.731,0.731,0.838,0.533 +544,1.146,1.146,0.646,0.731,0.731,0.837,0.533 +545,1.147,1.147,0.646,0.731,0.731,0.837,0.533 +546,1.144,1.144,0.643,0.731,0.731,0.837,0.533 +547,1.143,1.143,0.642,0.731,0.731,0.837,0.533 +548,1.149,1.149,0.645,0.731,0.731,0.837,0.533 +549,1.143,1.143,0.642,0.731,0.731,0.837,0.533 +550,1.145,1.145,0.643,0.731,0.731,0.838,0.533 +551,1.143,1.143,0.641,0.731,0.731,0.838,0.533 +552,1.142,1.142,0.641,0.731,0.731,0.838,0.533 +553,1.148,1.148,0.643,0.732,0.732,0.838,0.533 +554,1.142,1.142,0.640,0.731,0.731,0.838,0.533 +555,1.136,1.136,0.637,0.731,0.731,0.838,0.533 +556,1.140,1.140,0.641,0.731,0.731,0.838,0.533 +557,1.142,1.142,0.641,0.732,0.732,0.838,0.533 +558,1.142,1.142,0.641,0.732,0.732,0.838,0.533 +559,1.138,1.138,0.640,0.732,0.732,0.838,0.533 +560,1.139,1.139,0.641,0.731,0.731,0.838,0.533 +561,1.137,1.137,0.639,0.732,0.732,0.838,0.533 +562,1.139,1.139,0.640,0.732,0.732,0.838,0.533 +563,1.142,1.142,0.640,0.732,0.732,0.838,0.533 +564,1.135,1.135,0.637,0.732,0.732,0.838,0.533 +565,1.138,1.138,0.640,0.732,0.732,0.837,0.533 +566,1.138,1.138,0.637,0.732,0.732,0.838,0.533 +567,1.139,1.139,0.639,0.732,0.732,0.838,0.533 +568,1.138,1.138,0.640,0.732,0.732,0.838,0.533 +569,1.138,1.138,0.639,0.732,0.732,0.838,0.533 +570,1.140,1.140,0.641,0.732,0.732,0.838,0.533 +571,1.133,1.133,0.634,0.733,0.733,0.838,0.533 +572,1.134,1.134,0.636,0.733,0.733,0.838,0.533 +573,1.133,1.133,0.636,0.732,0.732,0.838,0.533 +574,1.132,1.132,0.636,0.732,0.732,0.838,0.533 +575,1.131,1.131,0.634,0.732,0.732,0.838,0.533 +576,1.138,1.138,0.636,0.733,0.733,0.838,0.533 +577,1.131,1.131,0.635,0.732,0.732,0.838,0.533 +578,1.131,1.131,0.634,0.732,0.732,0.838,0.533 +579,1.133,1.133,0.634,0.733,0.733,0.838,0.532 +580,1.132,1.132,0.634,0.733,0.733,0.838,0.532 +581,1.130,1.130,0.634,0.733,0.733,0.838,0.532 +582,1.132,1.132,0.634,0.733,0.733,0.838,0.532 +583,1.128,1.128,0.632,0.733,0.733,0.838,0.532 +584,1.129,1.129,0.633,0.733,0.733,0.838,0.532 
+585,1.131,1.131,0.632,0.733,0.733,0.838,0.532 +586,1.126,1.126,0.630,0.733,0.733,0.838,0.532 +587,1.127,1.127,0.631,0.733,0.733,0.838,0.532 +588,1.134,1.134,0.634,0.732,0.732,0.838,0.532 +589,1.129,1.129,0.632,0.732,0.732,0.838,0.532 +590,1.131,1.131,0.633,0.732,0.732,0.838,0.532 +591,1.095,1.095,0.592,0.733,0.733,0.838,0.532 +592,1.081,1.081,0.584,0.733,0.733,0.838,0.532 +593,1.078,1.078,0.581,0.733,0.733,0.838,0.532 +594,1.074,1.074,0.580,0.733,0.733,0.838,0.532 +595,1.075,1.075,0.579,0.733,0.733,0.838,0.532 +596,1.071,1.071,0.576,0.732,0.732,0.838,0.532 +597,1.073,1.073,0.578,0.732,0.732,0.838,0.532 +598,1.071,1.071,0.577,0.733,0.733,0.837,0.532 +599,1.069,1.069,0.576,0.733,0.733,0.838,0.532 +600,1.067,1.067,0.574,0.733,0.733,0.838,0.532 diff --git a/my_pretrained_method/YOLOv8_human/weights/v8_n.pt b/my_pretrained_method/YOLOv8_human/weights/v8_n.pt new file mode 100644 index 0000000000..1f025a03c2 Binary files /dev/null and b/my_pretrained_method/YOLOv8_human/weights/v8_n.pt differ diff --git a/my_pretrained_method/audio_code/wav2vec_age_gender.py b/my_pretrained_method/audio_code/wav2vec_age_gender.py new file mode 100644 index 0000000000..3d4fee61a3 --- /dev/null +++ b/my_pretrained_method/audio_code/wav2vec_age_gender.py @@ -0,0 +1,112 @@ +import numpy as np +import torch +import torch.nn as nn +from transformers import Wav2Vec2Processor +from transformers.models.wav2vec2.modeling_wav2vec2 import ( + Wav2Vec2Model, + Wav2Vec2PreTrainedModel, +) + + +class ModelHead(nn.Module): + r"""Classification head.""" + + def __init__(self, config, num_labels): + + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.final_dropout) + self.out_proj = nn.Linear(config.hidden_size, num_labels) + + def forward(self, features, **kwargs): + + x = features + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + + return x + + +class AgeGenderModel(Wav2Vec2PreTrainedModel): + r"""Speech emotion classifier.""" + + def __init__(self, config): + + super().__init__(config) + + self.config = config + self.wav2vec2 = Wav2Vec2Model(config) + self.age = ModelHead(config, 1) + self.gender = ModelHead(config, 3) + self.init_weights() + + def forward( + self, + input_values, + ): + + outputs = self.wav2vec2(input_values) + hidden_states = outputs[0] + hidden_states = torch.mean(hidden_states, dim=1) + logits_age = self.age(hidden_states) + logits_gender = torch.softmax(self.gender(hidden_states), dim=1) + + return hidden_states, logits_age, logits_gender + + + +# load model from hub +# device = 'cpu' +# model_name = '/mnt1/daoyuan_mm/wav2vec2-large-robust-24-ft-age-gender' +# processor = Wav2Vec2Processor.from_pretrained(model_name) +# model = AgeGenderModel.from_pretrained(model_name) + +# dummy signal +# sampling_rate = 16000 +# signal = np.zeros((1, sampling_rate), dtype=np.float32) + + +def process_func( + x: np.ndarray, + sampling_rate: int, + processor, + model, + device, + embeddings: bool = False, +) -> np.ndarray: + r"""Predict age and gender or extract embeddings from raw audio signal.""" + + # run through processor to normalize signal + # always returns a batch, so we just get the first entry + # then we put it on the device + y = processor(x, sampling_rate=sampling_rate) + y = y['input_values'][0] + y = y.reshape(1, -1) + y = torch.from_numpy(y).to(device) + + # run through model + with torch.no_grad(): + y = model(y) + if embeddings: + y = y[0] + else: + y = torch.hstack([y[1], 
y[2]])
+
+    # convert to numpy
+    y = y.detach().cpu().numpy()
+
+    return y
+
+
+# print(process_func(signal, sampling_rate))
+# # Age female male child
+# # [[ 0.33793038 0.2715511 0.2275236 0.5009253 ]]
+
+# print(process_func(signal, sampling_rate, embeddings=True))
+# Pooled hidden states of last transformer layer
+# [[ 0.024444 0.0508722 0.04930823 ... 0.07247854 -0.0697901
+# -0.0170537 ]]
diff --git a/my_pretrained_method/llama3_by_aliyun.py b/my_pretrained_method/llama3_by_aliyun.py
new file mode 100644
index 0000000000..f4693d2ef9
--- /dev/null
+++ b/my_pretrained_method/llama3_by_aliyun.py
@@ -0,0 +1,33 @@
+import transformers
+import torch
+from modelscope import snapshot_download
+import os
+os.environ['TF_ENABLE_ONEDNN_OPT'] = '2'
+
+# model_id = snapshot_download("LLM-Research/Meta-Llama-3.1-8B-Instruct")
+
+
+model_id = snapshot_download("LLM-Research/Meta-Llama-3.1-8B-Instruct",
+                             cache_dir="/mnt/zt_pt_model/Meta-Llama-3.1-8B-Instruct")
+
+pipeline = transformers.pipeline(
+    "text-generation",
+    model=model_id,
+    model_kwargs={"torch_dtype": torch.float16},
+    device_map="auto",
+)
+
+messages = [
+    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
+    {"role": "user", "content": "Who are you?"},
+]
+
+# pipeline.eval()
+
+outputs = pipeline(
+    messages,
+    temperature=0.1,
+    max_new_tokens=256,
+)
+print(outputs[0]["generated_text"][-1])
+
diff --git a/my_pretrained_method/modelscope_delete.py b/my_pretrained_method/modelscope_delete.py
new file mode 100644
index 0000000000..24dcbc8b57
--- /dev/null
+++ b/my_pretrained_method/modelscope_delete.py
@@ -0,0 +1,6 @@
+from transformers import pipeline
+gender_classifier = pipeline(model="/mnt1/daoyuan_mm/pedestrian_gender_recognition")
+image_path = "abc.jpg"
+
+results = gender_classifier(image_path)
+print(results)
diff --git a/my_pretrained_method/speech_age_gender.py b/my_pretrained_method/speech_age_gender.py
new file mode 100644
index 0000000000..9123230df6
--- /dev/null
+++ b/my_pretrained_method/speech_age_gender.py
@@ -0,0 +1,109 @@
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import Wav2Vec2Processor
+from transformers.models.wav2vec2.modeling_wav2vec2 import (
+    Wav2Vec2Model,
+    Wav2Vec2PreTrainedModel,
+)
+
+
+class ModelHead(nn.Module):
+    r"""Classification head."""
+
+    def __init__(self, config, num_labels):
+
+        super().__init__()
+
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.final_dropout)
+        self.out_proj = nn.Linear(config.hidden_size, num_labels)
+
+    def forward(self, features, **kwargs):
+
+        x = features
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+
+        return x
+
+
+class AgeGenderModel(Wav2Vec2PreTrainedModel):
+    r"""Speech emotion classifier."""
+
+    def __init__(self, config):
+
+        super().__init__(config)
+
+        self.config = config
+        self.wav2vec2 = Wav2Vec2Model(config)
+        self.age = ModelHead(config, 1)
+        self.gender = ModelHead(config, 3)
+        self.init_weights()
+
+    def forward(
+            self,
+            input_values,
+    ):
+
+        outputs = self.wav2vec2(input_values)
+        hidden_states = outputs[0]
+        hidden_states = torch.mean(hidden_states, dim=1)
+        logits_age = self.age(hidden_states)
+        logits_gender = torch.softmax(self.gender(hidden_states), dim=1)
+
+        return hidden_states, logits_age, logits_gender
+
+
+
+# load model from hub
+device = 'cpu'
+model_name = 'audeering/wav2vec2-large-robust-24-ft-age-gender'
+processor = Wav2Vec2Processor.from_pretrained(model_name)
+model = AgeGenderModel.from_pretrained(model_name)
+
+# dummy signal
+sampling_rate = 16000
+signal = np.zeros((1, sampling_rate), dtype=np.float32)
+
+
+def process_func(
+    x: np.ndarray,
+    sampling_rate: int,
+    embeddings: bool = False,
+) -> np.ndarray:
+    r"""Predict age and gender or extract embeddings from raw audio signal."""
+
+    # run through processor to normalize signal
+    # always returns a batch, so we just get the first entry
+    # then we put it on the device
+    y = processor(x, sampling_rate=sampling_rate)
+    y = y['input_values'][0]
+    y = y.reshape(1, -1)
+    y = torch.from_numpy(y).to(device)
+
+    # run through model
+    with torch.no_grad():
+        y = model(y)
+        if embeddings:
+            y = y[0]
+        else:
+            y = torch.hstack([y[1], y[2]])
+
+    # convert to numpy
+    y = y.detach().cpu().numpy()
+
+    return y
+
+
+print(process_func(signal, sampling_rate))
+# Age female male child
+# [[ 0.33793038 0.2715511 0.2275236 0.5009253 ]]
+
+print(process_func(signal, sampling_rate, embeddings=True))
+# Pooled hidden states of last transformer layer
+# [[ 0.024444 0.0508722 0.04930823 ... 0.07247854 -0.0697901
+# -0.0170537 ]]
diff --git a/my_pretrained_method/use_in_dj.py b/my_pretrained_method/use_in_dj.py
new file mode 100644
index 0000000000..4cd90b1c14
--- /dev/null
+++ b/my_pretrained_method/use_in_dj.py
@@ -0,0 +1,67 @@
+# /tmp/zt_ori_mnt1/mnt/zt_pt_model/cogvlm2-video-llama3-chat
+# /tmp/zt_ori_mnt1/mnt/zt_pt_model/llava-onevision-qwen2-7b-ov-hf
+
+import numpy as np
+import torch
+
+DEVICE = 'cuda'  # CogVLM2 inputs below are moved to the GPU
+
+def cogvlm2_video_llama3_chat(video_path, query, model, tokenizer):
+    import sys
+    sys.path.append('/home/daoyuan_mm/data_juicer/data_juicer/my_pretrained_method/CogVLM2')
+    from video_demo.cli_video_demo import load_video
+    video = load_video(video_path, strategy='chat')
+    inputs = model.build_conversation_input_ids(
+        tokenizer=tokenizer,
+        query=query,
+        images=[video],
+        history=[],
+        template_version='chat'
+    )
+    inputs = {
+        'input_ids': inputs['input_ids'].unsqueeze(0).to(DEVICE),
+        'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(DEVICE),
+        'attention_mask': inputs['attention_mask'].unsqueeze(0).to(DEVICE),
+        'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]],
+    }
+    gen_kwargs = {
+        "max_new_tokens": 2048,
+        "pad_token_id": 128002,
+        "top_k": 1,
+        "do_sample": True,
+        "top_p": 0.1,
+        "temperature": 0.1,
+    }
+    with torch.no_grad():
+        outputs = model.generate(**inputs, **gen_kwargs)
+        outputs = outputs[:, inputs['input_ids'].shape[1]:]
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return response
+
+
+def LLaVAOnevision(video_path, processor, model):
+    import av
+    from data_juicer.utils.mm_utils import read_video_pyav
+    # Load the video as an np.array, sampling uniformly 8 frames (can sample more for longer videos, up to 32 frames)
+    container = av.open(video_path)
+    total_frames = container.streams.video[0].frames
+    indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+    video = read_video_pyav(container, indices)
+
+    # For videos we have to feed a "video" type instead of "image"
+    conversation = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "video"},
+                {"type": "text", "text": "Why is this video funny?"},
+            ],
+        },
+    ]
+
+    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+    inputs = processor(videos=list(video), text=prompt, return_tensors="pt").to(model.device)
+
+    out = model.generate(**inputs, max_new_tokens=60)
+    response = processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    return response
\ No newline at end of file
diff --git a/my_pretrained_method/videollava.py b/my_pretrained_method/videollava.py
new file mode 100644
index 0000000000..3f53a3b43e
--- /dev/null
+++ b/my_pretrained_method/videollava.py
@@ -0,0 +1,61 @@
+import os
+# os.environ['CUDA_VISIBLE_DEVICES'] = '3'
+from PIL import Image
+import requests
+import numpy as np
+import av
+from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
+
+def read_video_pyav(container, indices):
+    '''
+    Decode the video with PyAV decoder.
+
+    Args:
+        container (av.container.input.InputContainer): PyAV container.
+        indices (List[int]): List of frame indices to decode.
+
+    Returns:
+        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
+    '''
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+def videollava_generate(model, processor, prompt="USER: