Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions auxiliaries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
import shutil


def create_if_none(dir_path):
    """
    Create the directory (and any missing parents) unless the path already exists.

    :param dir_path: path of the directory to create
    :return: None
    :raises OSError: if the directory could not be created
    """
    if os.path.exists(dir_path):
        return

    try:
        os.makedirs(dir_path)
    except OSError:
        # Another process/thread may have created the directory between the
        # existence check and makedirs; that outcome is what we wanted anyway.
        if not os.path.isdir(dir_path):
            raise


def create_or_recreate_dir(dir_path):
    """
    Make sure dir_path is a freshly created, empty directory.

    An existing directory at that path is removed together with its contents
    before the new one is created.

    :param dir_path: path of the directory to (re)create
    :return: None
    """
    already_a_dir = os.path.isdir(dir_path)
    if already_a_dir:
        # Drop the old directory tree so the new one starts out empty
        shutil.rmtree(dir_path)

    os.makedirs(dir_path)


def validate_exists_and_dir(dir_path, arg_name):
    """
    Check that dir_path points to an existing directory.

    :param dir_path: path to check
    :param arg_name: name of the argument the path came from, used in the error message
    :return: None
    :raises ValueError: if the path does not exist, or exists but is not a directory
    """
    if os.path.isdir(dir_path):
        return

    # Not a directory: report whether the path is missing altogether or is something else
    if os.path.exists(dir_path):
        message = "{0} {1} is not a dir"
    else:
        message = "{0} {1} does not exist"
    raise ValueError(message.format(arg_name, dir_path))


def imap_wrapper(args):
    """
    Call a function whose callable and positional arguments are packed into one sequence.

    Intended as the callable handed to an imap-style mapper, which passes a
    single item per call.

    :param args: sequence of the form (func, arg_1, ..., arg_n)
    :return: result of func(arg_1, ..., arg_n)
    """
    func = args[0]
    positional_args = args[1:]
    return func(*positional_args)
19 changes: 2 additions & 17 deletions download_archive.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from download_single_item import LesionImageDownloader as ImgDownloader, SegmentationDownloader as SegDownloader
from auxiliaries import create_if_none, imap_wrapper

import argparse
import os
Expand Down Expand Up @@ -54,22 +55,6 @@ def download_archive(num_images_requested, offset, skip_images, segmentation, fi
print('Finished downloading')


def create_if_none(dir_path):
    """
    Create the directory (and any missing parents) unless the path already exists.

    :param dir_path: path of the directory to create
    :return: None
    :raises OSError: if the directory could not be created
    """
    if os.path.exists(dir_path):
        return

    try:
        os.makedirs(dir_path)
    except OSError:
        # Another process/thread may have created the directory between the
        # existence check and makedirs; that outcome is what we wanted anyway.
        if not os.path.isdir(dir_path):
            raise


def imap_wrapper(args):
    """
    Call a function whose callable and positional arguments are packed into one sequence.

    Intended as the callable handed to an imap-style mapper, which passes a
    single item per call.

    :param args: sequence of the form (func, arg_1, ..., arg_n)
    :return: result of func(arg_1, ..., arg_n)
    """
    func = args[0]
    positional_args = args[1:]
    return func(*positional_args)


def get_images_ids(num_images, offset):
"""

Expand Down Expand Up @@ -253,4 +238,4 @@ def main(args):


if __name__ == '__main__':
    # Run the command-line entry point exactly once, passing the arguments
    # that follow the script name.
    main(sys.argv[1:])
62 changes: 62 additions & 0 deletions filter_invalid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import os
from os.path import join
import json
import shutil
import argparse
import multiprocessing
from itertools import repeat
from tqdm import tqdm
from auxiliaries import validate_exists_and_dir, create_or_recreate_dir, imap_wrapper


def _has_valid_label(desc):
    """
    Tell whether a loaded description carries a usable benign/malignant label.

    :param desc: parsed JSON content of a description file
    :return: True if desc['meta']['clinical']['benign_malignant'] is 'benign' or 'malignant'
    """
    try:
        label = desc['meta']['clinical']['benign_malignant']
        return label in {'benign', 'malignant'}
    except (KeyError, TypeError):
        # Missing keys, or a structure that is not the expected nesting of dicts
        # (e.g. a null 'clinical' entry or an unhashable label), mean there is no usable label
        return False


def filter_invalid_images(images_dir, descs_dir, inv_images_dir, inv_descs_dir, num_processes):
    """
    Move images whose description has no benign/malignant label, along with
    those descriptions, into separate directories.

    Images and descriptions are paired by position after sorting both directory
    listings by file name, so both directories are expected to hold matching
    files whose names sort in the same order.

    :param images_dir: directory which holds the images, and only them
    :param descs_dir: directory which holds the JSON descriptions, and only them
    :param inv_images_dir: directory receiving the invalid images (removed and recreated on every run)
    :param inv_descs_dir: directory receiving the invalid descriptions (removed and recreated on every run)
    :param num_processes: number of worker processes used for moving the files
    :return: None
    :raises ValueError: if images_dir or descs_dir does not exist or is not a directory
    """
    validate_exists_and_dir(images_dir, 'images_dir')
    validate_exists_and_dir(descs_dir, 'descs_dir')
    # Create the result dirs
    create_or_recreate_dir(inv_images_dir)
    create_or_recreate_dir(inv_descs_dir)

    # os.listdir returns entries in arbitrary order; sort both listings so that
    # the positional pairing below matches each image with its own description
    images_fnames = sorted(os.listdir(images_dir))
    descs_fnames = sorted(os.listdir(descs_dir))

    # Find which descriptions are invalid
    # And mark them and their corresponding image for moving
    src_paths = []
    dst_paths = []
    for image_fname, desc_fname in zip(images_fnames, descs_fnames):
        desc_path = join(descs_dir, desc_fname)
        # Load the json, closing the file right away
        with open(desc_path) as desc_file:
            desc = json.load(desc_file)
        if _has_valid_label(desc):
            continue
        # The description is invalid.
        # Mark it and its corresponding image for moving
        src_paths += [join(images_dir, image_fname), desc_path]
        dst_paths += [join(inv_images_dir, image_fname), join(inv_descs_dir, desc_fname)]

    # Move the invalid descriptions and images to the filtered directories
    p = multiprocessing.Pool(processes=num_processes)
    try:
        # list() drains the lazy imap iterator so that every move is actually performed
        list(tqdm(p.imap(imap_wrapper, zip(repeat(shutil.move), src_paths, dst_paths)), total=len(src_paths), desc='Filtering invalid images'))
    finally:
        # Always release the worker processes, even if a move failed
        p.close()
        p.join()


if __name__ == '__main__':
    # Command-line entry point: read the directories and the process count,
    # then filter the invalid images and descriptions into <inv-dir>/images and <inv-dir>/descs
    cli = argparse.ArgumentParser()
    cli.add_argument('--images-dir', required=True, type=str,
                     help='Directory which holds the images, and only them')
    cli.add_argument('--descs-dir', required=True, type=str,
                     help='Directory which holds the descriptions of the images and only them')
    cli.add_argument('--inv-dir', required=True, type=str,
                     help='Directory to store the filtered out invalid data')
    cli.add_argument('--p', default=16, type=int,
                     help='Number of processes to use in parallel')
    options = cli.parse_args()

    filter_invalid_images(
        images_dir=options.images_dir,
        descs_dir=options.descs_dir,
        inv_images_dir=join(options.inv_dir, 'images'),
        inv_descs_dir=join(options.inv_dir, 'descs'),
        num_processes=options.p,
    )