Module medpicpy.parsing

medpicpy's higher-level functions to abstract over reading in medical imaging data

Expand source code
"""medpicpy's higher level functions to abstract over reading 
in medical imaging data

"""
import glob
import logging
from pathlib import Path
from os.path import normpath

import pandas as pd
import numpy as np
import cv2

from . import io
from .utils import remove_sub_paths
from . import config

logging.getLogger(__name__)

def load_images_from_csv(dataframe,
                         image_name_column,
                         image_dir_path,
                         output_shape,
                         use_memory_mapping=False):
    """Read in an array of images from paths specified in a csv

    ## Example
    ```python
    import medpicpy as med
    import pandas as pd

    description = pd.read_csv("data.csv") 
    array = med.load_images_from_csv(description, 0, "mini-MIAS/", (224, 224))
    ```
    Args:
        dataframe (pandas.DataFrame): A pandas dataframe from the csv
        image_name_column (index): Index of column with image names
        image_dir_path (string): Path to directory containing images
        output_shape (tuple): Output shape for each image
        use_memory_mapping (optional, boolean): store the data on disk instead of in memory.
            Defaults to False

    Returns:
        np.Array: Array of images in order 
    """
    image_names = dataframe[image_name_column]
    image_paths = image_names.apply(lambda x : image_dir_path + "/" + x)
    image_paths = image_paths.apply(lambda x : normpath(x))

    images = load_images_from_paths(image_paths, output_shape, use_memory_mapping=use_memory_mapping)
    return images

    
def load_bounding_boxes_from_csv(
    dataframe, 
    centre_x_column, 
    centre_y_column, 
    width_column, 
    height_column, 
    x_scale_factor=1,
    y_scale_factor=1
    ): # for bounding boxes need to know if measurements are in pixels or mm
    """Read bounding boxes from dataframe of csv

    ##Example
    ```python
    import medpicpy as med
    import pandas as pd

    description = pd.read_csv("data.csv") 

    # x and y scale factor are new_image_size / original_image_size
    # only set if the images were resized when being loaded in
    x_scale_factor = 224 / 1024
    y_scale_factor = 224 / 1024

    xs, ys, widths, heights = med.load_bounding_boxes_from_csv(
        description, 
        4, 
        5, 
        6, 
        6, 
        x_scale_factor=x_scale_factor, 
        y_scale_factor=y_scale_factor
    )

    ```
    Args:
        dataframe (pandas.DataFrame): Dataframe of csv
        centre_x_column (index): Index of column for x anchor of box
        centre_y_column (index): Index of column for y anchor of box
        width_column (index): Index of column for width of box
        height_column (index): Index of column for height of box.
            Can be same as width column for squares or circles.
        x_scale_factor (int, optional): Factor to rescale by if image was reshaped. Defaults to 1.
        y_scale_factor (int, optional): Factor to rescale by if image was reshaped. Defaults to 1.

    Returns:
        tuple: 4-tuple of np.arrays with xs, ys, widths, and heights
    """
    bbox_xs = dataframe[centre_x_column]
    bbox_xs = bbox_xs.multiply(x_scale_factor)
    xs_array = bbox_xs.to_numpy(dtype=np.float16)

    bbox_ys = dataframe[centre_y_column]
    bbox_ys = bbox_ys.multiply(y_scale_factor)
    ys_array = bbox_ys.to_numpy(dtype=np.float16)


    bbox_widths = dataframe[width_column]
    bbox_widths = bbox_widths.multiply(x_scale_factor)
    widths_array = bbox_widths.to_numpy(dtype=np.float16)

    bbox_heights = dataframe[height_column]
    bbox_heights = bbox_heights.multiply(y_scale_factor)
    heights_array = bbox_heights.to_numpy(dtype=np.float16)

    array_tuple = (xs_array, ys_array, widths_array, heights_array)

    return array_tuple

# To read datasets where the class name is in the directory structure.
# i.e. covid/im001 or no-covid/im001
# pulls the class names from the path and reads in the images
# as a numpy array
# TODO: make this work for 3D images, either make a new function or 
# add optional args (would be slice axis and slices to take)
def load_classes_in_directory_name(directory, 
    image_extension, 
    output_shape, 
    class_level=1,
    slices_to_take=None,
    slice_axis=-2,
    use_memory_mapping=False):
    """Parse datasets where the class name is in the 
    directory structure

    Use this when the class name is one of the directory names
    in the dataset structure. 
    ## Example
    If dataset has directory structure:
    ```
    dataset/
        benign/
            im001.dcm
            im002.dcm
        malignant/
            im001.dcm
            im002.dcm
    ```
    then:
    ```python
    import medpicpy as med

    classes, images = med.load_classes_in_directory_name(
        "dataset/",
        ".dcm",
        (128, 128)
    )
    print(classes)
    # ["benign", "benign", "malignant", "malignant"]
    print(images.shape)
    # (4, 128, 128)
    ```
    Args:
        directory (path): root directory of dataset
        image_extension (str): File extension used to match images,
            e.g. ".png" for PNGs
        output_shape (tuple): Desired output shape of images
        class_level (int, optional): Which level of directory structure 
            contains class name. Defaults to 1.
        use_memory_mapping (optional, boolean): store the data on disk instead of in memory.
            Defaults to False
    Returns:
        list(str), np.Array : list of classes and corresponding images with correct shape
    """
    path_to_search = directory + "/**/*" + image_extension
    files = glob.glob(path_to_search, recursive=True)
    files = remove_sub_paths(files)
    number_of_files = len(files)
    array_shape = (number_of_files,) + output_shape
    array = io.allocate_array(array_shape, use_memory_mapping=use_memory_mapping)
    classes = np.empty(number_of_files, dtype=object)

    for index, name in enumerate(files):
        parts = Path(name).parts
        class_name = parts[class_level]

        image = io.load_image(name, use_memory_mapping=use_memory_mapping)
        result = cv2.resize(image, output_shape)

        classes[index] = class_name
        array[index] = result
        
    return classes, array



def load_images_from_paths(paths, output_shape, use_memory_mapping=False):
    """2D image loading function that takes an array of 
    paths and an output shape and returns the images in 
    the same order as the paths. Requires every 
    path to have an image and every image to be resizeable 
    to the given output shape.

    For higher-dimensional images use load_series_from_paths.

    Args:
        paths (list or array-like): paths of images to load
        output_shape (tuple): desired shape of each image
        use_memory_mapping (optional, boolean): store the data on disk instead of in memory.
            Defaults to False
    Returns:
        np.array: all images in numpy format with given shape
    """
    array_length = len(paths)
    array_shape = (array_length,) + output_shape # concat tuples to get shape
    image_array = io.allocate_array(array_shape, use_memory_mapping=use_memory_mapping)

    for i in range(0, array_length):
        print("Loading images {} / {}".format(i + 1, len(paths)), end="\r", flush=True)
        image_name = paths[i]
        image = io.load_image(image_name, use_memory_mapping=use_memory_mapping)
        resized = cv2.resize(image, output_shape)
        image_array[i] = resized
    print("")
    return image_array

# slice axis will be -2 for most things since they 
# are 1 channel, for colour images would probably be -3
# But I don't think you get colour 3D scans
# It would work for multimodal things stacked on top of each other though
def load_series_from_paths(
    paths,
    slice_output_shape, 
    slices_to_take,
    slice_axis=-2,
    use_memory_mapping=False
    ):
    """Load an array of 3D scans into memory from their paths.

    Useful for e.g. CT or MR scans. Takes a list of paths, the output shape
    for each 2D slice and a list containing which slices 
    to take from each image. To take the first 60 slices
    pass range(0, 60). 

    The output shape should be a tuple of (int, int).
    
    Optionally takes which axis to reshape the image along.
    For any scans with one-channel (grayscale) slices this should
    be -2; if there is a colour channel (or it's some kind
    of multimodal stack) then the axis would be -3.

    ## Example
    If there is a dataset with the structure:
    ```
    data/
        patient-data.csv
        ID-001/
            SCANS/
                CT/
                    prone.nii.gz
        ID-002/
            SCANS/
                CT/
                    prone.nii.gz
        ID-003/
            SCANS/
                CT/
                    prone.nii.gz
    ```
    then:
    ```python
    import pandas as pd
    import medpicpy as med

    description = pd.read_csv("data/patient-data.csv")
    patient_ids = description["id"]
    filters = ["CT", "prone"]

    image_paths = med.get_paths_from_ids(
        "data/",
        patient_ids,
        filters
    )

    print(image_paths)
    # ["data/ID-001/CT/prone.nii.gz", "data/ID-002/CT/prone.nii.gz", "data/ID-003/CT/prone.nii.gz"]
    
    slices_to_take = range(60, 120)
    output_slice_shape = (128, 128)    # desired shape of each slice in the scan
    images = med.load_series_from_paths(
        image_paths,
        output_slice_shape,
        slices_to_take
        )

    print(images.shape)
    # (3, 60, 128, 128)  
    ```
    
    Args:
        paths (list): list of paths to the scans to load
        slice_output_shape (tuple): shape each slice should be resized to
        slices_to_take (list): list of indices of slices to take
        slice_axis (int, optional): axis to resize along. Defaults to -2.
        use_memory_mapping (optional, boolean): store the data on disk instead of in memory.
            Defaults to False
    Returns:
        np.array: array of all scans with specified size
    """

    output_shape = (len(paths), len(slices_to_take)) + slice_output_shape
    output_array = io.allocate_array(output_shape, use_memory_mapping=use_memory_mapping)
    
    for i in range(0, len(paths)):
        print("Loading images {} / {}".format(i + 1, len(paths)), end="\r", flush=True)
        path = paths[i]
        image = io.load_image(path, use_memory_mapping=False)
        new_image = io.allocate_array(((len(slices_to_take),) + image[0].shape), use_memory_mapping=False)

        for index, slice_index in enumerate(slices_to_take):
            new_image[index] = image[slice_index]
        
        final_shape = new_image.shape[:slice_axis] + slice_output_shape + new_image.shape[:slice_axis + 2]
        final_image = io.allocate_array(final_shape, use_memory_mapping=use_memory_mapping)

        for j in range(final_shape[0]):
            image = new_image[j][slice_axis]
            image = cv2.resize(image, slice_output_shape)
            final_image[j] = image
        
        output_array[i] = final_image
    print("")
    return output_array


def get_length_of_all_series(paths):
    """Find the number of 2D slices 
    in a list of images. These images can 
    be 2D, 3D, or a mixture of both. Also 
    returns the paths that each slice comes from, 
    e.g. if an image contains 250 slices, 
    then that path will be duplicated 250 times 
    in the array so the original scan is known. 

    Args:
        paths (list(str)): paths to images

    Returns:
        int, list(str): the total number of slices, and the paths that
        the images come from. 
    """
    all_series = []
    for index, path in enumerate(paths):
        if index % 100 == 0:
            print(f"Getting length of images ~ {index} of {len(paths)}", end="\r")
            logging.debug(f"Getting length of images ~ {index} of {len(paths)}")

        image = io.load_image(path)
        shape = None if image is None else image.shape
        all_series.append(shape)

    print("finished reading all series")
    none_count = 0
    for series in all_series:
        if series is None:
            none_count += 1
    print(f"{none_count} out of {len(all_series)} could not be read.")

    final_paths = []
    series_and_paths = [(series, paths[i]) for i, series in enumerate(all_series) if series is not None]
    all_series = [series for series in all_series if series is not None]
    number_of_series = 0
    for index, series in enumerate(all_series):
        path = series_and_paths[index][1]   # get the path for the image
        if len(series) == 2:  # then it is already 2D
            number_of_series += 1
            final_paths.append(path)
            continue
        elif len(series) == 3: # if it's 3D
            if series[0] == 1:        # if it's actually 2D
                number_of_series += 1
                final_paths.append(path)
            elif series[2] == 3:
                if not config.suppress_errors:
                    print("MedPicPy does not currently work with multichannel images")
                    exit(0)
                    #TODO: change to exception                   
                continue
            else:
                for image in series:
                    number_of_series += 1
                    final_paths.append(path)
        else:
            print(f"Its not 2 or 3D!:{series}")
            #TODO: change to exception
            exit(0)
    return number_of_series, final_paths
            
def load_all_slices_from_series(paths,
    all_series_length,
    output_shape,
    use_memory_mapping=False):
    """Load a dataset of 2D slices from a list 
    of 2 or 3 dimensional scans. Use `get_length_of_all_series`
    to find `all_series_length` if it is not known 
    ahead of time. 

    Args:
        paths (list(str)): list of image paths to load
        all_series_length (int): length of output array
        output_shape (tuple): dimensions to resize each slice to 
        use_memory_mapping (bool, optional): store data on drive instead of ram. Defaults to False.

    Returns:
        np Array: numpy array of resized slices
    """
    
    output_array_shape = (all_series_length,) + output_shape
    array = io.allocate_array(output_array_shape, use_memory_mapping=use_memory_mapping)
    images_written = 0
    for image_index, path in enumerate(paths):
        try:
            if images_written == all_series_length:
                print("Breaking early")
                break
            if image_index % 100 == 0:
                print(f"Re-loading image: {image_index} of {len(paths)}", end="\r")
                logging.debug(f"Re-loading image: {image_index} of {len(paths)}")
            
            image = io.load_image(path, use_memory_mapping=use_memory_mapping)
            if image is None:
                continue
            image_shape = image.shape
            if len(image_shape) == 2:
                resized_image = cv2.resize(image, output_shape)
                array[images_written] = resized_image[:]
                images_written += 1
            elif len(image_shape) == 3:

                if image_shape[0] == 1: # it's actually 2D
                    resized_image = cv2.resize(image, output_shape)
                    array[images_written] = resized_image[:]
                    images_written += 1
                elif image_shape[2] == 3:
                    continue    #skip rgb
                else:
                    for slice_index in range(len(image)):
                        resized_image = cv2.resize(image[slice_index], output_shape)
                        array[images_written] = resized_image[:]
                        images_written += 1
        except IndexError:
            #TODO: a weird bug, look into why this happens sometimes. 
            if config.suppress_errors:
                logging.debug(f"Suppressing index error in load index for paths:\nAttempted to write to index {images_written}, out of bounds for length {len(array)} on image path {path}")
            else:
                raise
    return array

def load_specific_slices_from_series(
    paths,
    output_shape,
    slices_to_take,
    use_memory_mapping=False):
    """Get specific slice or slices from series of scans.
    Takes path, desired shape and array of slice/slices to 
    take from each series. 

    Args:
        paths (array): array of paths to the series
        output_shape (tuple): desired shape of each slice
        slices_to_take (array of arrays): one array of slices 
            to take for each series
        use_memory_mapping (optional, boolean): store the data on disk instead of in memory.
            Defaults to False

    Returns:
        np.array: every slice as specified by the slices_to_take
    """ 
    all_series = [io.load_image(path, use_memory_mapping=use_memory_mapping) for path in paths]
    chosen = [[] for series in all_series]

    if len(all_series) != len(slices_to_take):
        print("length of series is not the same as slices array")
        exit(0)
    
    for series in range(0, len(slices_to_take)):
        for slice_index in slices_to_take[series]:
            chosen_slice = all_series[series][slice_index]
            resized_slice = cv2.resize(chosen_slice, output_shape)
            chosen[series].append(resized_slice)
    
    series_lengths = [len(new_series) for new_series in chosen]
    output_array_length = sum(series_lengths)
    output_array_shape = (output_array_length,) + output_shape

    array = io.allocate_array(output_array_shape, use_memory_mapping=use_memory_mapping)

    output_index = 0
    for series_counter in range(0, len(series_lengths)):
        for image_counter in range(0, series_lengths[series_counter]):
            array[output_index] = chosen[series_counter][image_counter]
            output_index += 1
    

    return array

def stack_modalities(arrays, axis=-1):
    """Turn a list of arrays into one multimodal array.
    
    Creates one array where each element has 
    len(arrays) images. 

    ## Example
    If we have a dataset like:
    ```
    dataset/
        ID-1/
            flair.nii.gz
            t1.nii.gz
        ID-2/
            flair.nii.gz
            t1.nii.gz
    ```
    then:
    ```python
    import medpicpy as med 
    modalities = [["flair"], ["t1"]]
    paths_for_modality = [med.get_paths_from_ids(
        "dataset/",
        ["ID-1", "ID-2"],
        path_filters = modality
    ) for modality in modalities]

    arrays = [med.load_series_from_paths(
        paths,
        (128, 128),
        range(60, 80)
    ) for paths in paths_for_modality]

    multimodal_array = med.stack_modalities(arrays)
    print(multimodal_array.shape)
    # (2, 20, 128, 128, 2)
    ```
    You might want to flatten along the first axis after 
    doing this depending on the dimensionality of the model you are using.
    ```python
    flat_multi_modal = multimodal_array.reshape(-1, *multimodal_array.shape[2:])
    print("multi modal shape: ", flat_multi_modal.shape)
    # (40, 128, 128, 2)
    ```
    Args:
        arrays (array): array of arrays of images to stack on 
            top of each other
        axis (int, optional): The axis to stack along, 
            leaving this default is probably fine. Defaults to -1.

    Returns:
        array: the arrays stacked on top of each other
    """
    return np.stack(arrays, axis=axis)

Functions

def get_length_of_all_series(paths)

Find the number of 2D slices in a list of images. These images can be 2D, 3D, or a mixture of both. Also returns the paths that each slice comes from, e.g. if an image contains 250 slices, then that path will be duplicated 250 times in the array so the original scan is known.

Args

paths : list(str)
paths to images

Returns

int, list(str)
the total number of slices, and the paths that the images come from.
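
Example

A minimal sketch of counting slices across a hypothetical directory of mixed 2D and 3D scans; the glob pattern, file names, and counts below are illustrative.

import glob
import medpicpy as med

paths = sorted(glob.glob("dataset/**/*.nii.gz", recursive=True))

total_slices, slice_paths = med.get_length_of_all_series(paths)
print(total_slices)
# e.g. 1200
print(len(slice_paths))
# 1200 -- one entry per 2D slice, so each slice can be traced back to its scan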

def load_all_slices_from_series(paths, all_series_length, output_shape, use_memory_mapping=False)

Load a dataset of 2D slices from a list of 2 or 3 dimensional scans. Use get_length_of_all_series() to find all_series_length if it is not known ahead of time.

Args

paths : list(str)
list of image paths to load
all_series_length : int
length of output array
output_shape : tuple
dimensions to resize each slice to
use_memory_mapping : bool, optional
store data on drive instead of ram. Defaults to False.

Returns

np Array
numpy array of resized slices
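
Example

A minimal sketch pairing this with get_length_of_all_series; the directory layout, glob pattern, and slice shape are illustrative.

import glob
import medpicpy as med

paths = sorted(glob.glob("dataset/**/*.nii.gz", recursive=True))

# count the 2D slices first, then load and resize them all
total_slices, slice_paths = med.get_length_of_all_series(paths)
slices = med.load_all_slices_from_series(paths, total_slices, (128, 128))
print(slices.shape)
# (total_slices, 128, 128)
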
def load_bounding_boxes_from_csv(dataframe, centre_x_column, centre_y_column, width_column, height_column, x_scale_factor=1, y_scale_factor=1)

Read bounding boxes from a csv dataframe

Example

import medpicpy as med
import pandas as pd

description = pd.read_csv("data.csv") 

# x and y scale factor are new_image_size / original_image_size
# only set if the images were resized when being loaded in
x_scale_factor = 224 / 1024
y_scale_factor = 224 / 1024

xs, ys, widths, heights = med.load_bounding_boxes_from_csv(
    description, 
    4, 
    5, 
    6, 
    6, 
    x_scale_factor=x_scale_factor, 
    y_scale_factor=y_scale_factor
)

Args

dataframe : pandas.DataFrame
Dataframe of csv
centre_x_column : index
Index of column for x anchor of box
centre_y_column : index
Index of column for y anchor of box
width_column : index
Index of column for width of box
height_column : index
Index of column for height of box. Can be same as width column for squares or circles.
x_scale_factor : int, optional
Factor to rescale by if image was reshaped. Defaults to 1.
y_scale_factor : int, optional
Factor to rescale by if image was reshaped. Defaults to 1.

Returns

tuple
4-tuple of np.arrays with xs, ys, widths, and heights
def load_classes_in_directory_name(directory, image_extension, output_shape, class_level=1, slices_to_take=None, slice_axis=-2, use_memory_mapping=False)

Parse datasets where the class name is in the directory structure

Use this when the class name is one of the directory names in the dataset structure.

Example

If dataset has directory structure:

dataset/
    benign/
        im001.dcm
        im002.dcm
    malignant/
        im001.dcm
        im002.dcm

then:

    import medpicpy as med

    classes, images = med.load_classes_in_directory_name(
        "dataset/",
        ".dcm",
        "(128, 128)"
    )
    print(classes)
    # ["benign", "benign", "malignant", "malignant"]
    print(images.shape)
    # (4, 128, 128)

Args

directory : path
root directory of dataset
image_extension : str
File extension used to match images, e.g. ".png" for PNGs
output_shape : tuple
Desired output shape of images
class_level : int, optional
Which level of directory structure contains class name. Defaults to 1.
use_memory_mapping : optional, boolean
store the data on disk instead of in memory. Defaults to False

Returns

list(str), np.Array : list of classes and corresponding images with correct shape

def load_images_from_csv(dataframe, image_name_column, image_dir_path, output_shape, use_memory_mapping=False)

Read in an array of images from paths specified in a csv

Example

import medpicpy as med
import pandas as pd

description = pd.read_csv("data.csv") 
array = med.load_images_from_csv(description, 0, "mini-MIAS/", (224, 224))

Args

dataframe : pandas.DataFrame
A pandas dataframe from the csv
image_name_column : index
Index of column with image names
image_dir_path : string
Path to directory containing images
output_shape : tuple
Output shape for each image
use_memory_mapping : optional, boolean
store the data on disk instead of in memory. Defaults to False

Returns

np.Array
Array of images in order
def load_images_from_paths(paths, output_shape, use_memory_mapping=False)

2D image loading function that takes an array of paths and an output shape and returns the images in the same order as the paths. Requires every path to have an image and every image to be resizeable to the given output shape.

For higher-dimensional images use load_series_from_paths.

Args

paths : list or array-like
paths of images to load
output_shape : tuple
desired shape of each image
use_memory_mapping : optional, boolean
store the data on disk instead of in memory. Defaults to False

Returns

np.array
all images in numpy format with given shape
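
Example

A minimal sketch; the file names and output shape are illustrative.

import medpicpy as med

paths = ["scans/im001.png", "scans/im002.png", "scans/im003.png"]
images = med.load_images_from_paths(paths, (224, 224))
print(images.shape)
# (3, 224, 224)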
def load_series_from_paths(paths, slice_output_shape, slices_to_take, slice_axis=-2, use_memory_mapping=False)

Load an array of 3D scans into memory from their paths.

Useful for e.g. CT or MR scans. Takes a list of paths, the output shape for each 2D slice and a list containing which slices to take from each image. To take the first 60 slices pass range(0, 60).

The output shape should be a tuple of (int, int).

Optionally takes which axis to reshape the image along. For any scans with one-channel (grayscale) slices this should be -2; if there is a colour channel (or it's some kind of multimodal stack) then the axis would be -3.

Example

If there is a dataset with the structure:

data/
    patient-data.csv
    ID-001/
        SCANS/
            CT/
                prone.nii.gz
    ID-002/
        SCANS/
            CT/
                prone.nii.gz
    ID-003/
        SCANS/
            CT/
                prone.nii.gz

then:

import pandas as pd
import medpicpy as med

description = pd.read_csv("data/patient-data.csv")
patient_ids = description["id"]
filters = ["CT", "prone"]

image_paths = med.get_paths_from_ids(
    "data/",
    patient_ids,
    filters
)

print(image_paths)
# ["data/ID-001/CT/prone.nii.gz", "data/ID-002/CT/prone.nii.gz", "data/ID-003/CT/prone.nii.gz"]

slices_to_take = range(60, 120)
output_slice_shape = (128, 128)    # desired shape of each slice in the scan
images = med.load_series_from_paths(
    image_paths,
    output_slice_shape,
    slices_to_take
    )

print(images.shape)
# (3, 60, 128, 128)  

Args

paths : list
list of paths to the scans to load
slice_output_shape : tuple
shape each slice should be resized to
slices_to_take : list
list of indices of slices to take
slice_axis : int, optional
axis to resize along. Defaults to -2.
use_memory_mapping : optional, boolean
store the data on disk instead of in memory. Defaults to False

Returns

np.array
array of all scans with specified size
def load_specific_slices_from_series(paths, output_shape, slices_to_take, use_memory_mapping=False)

Get specific slices from each of a series of scans. Takes paths, the desired slice shape, and an array of slice indices to take from each series.

Args

paths : array
array of paths to the series
output_shape : tuple
desired shape of each slice
slices_to_take : array of arrays
one array of slices to take for each series
use_memory_mapping : optional, boolean
store the data on disk instead of in memory. Defaults to False

Returns

np.array
every slice as specified by the slices_to_take
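
Example

A minimal sketch taking different slices from each of two scans; the paths and slice indices are illustrative.

import medpicpy as med

paths = ["data/ID-001/scan.nii.gz", "data/ID-002/scan.nii.gz"]
slices_to_take = [[30, 31, 32], [40, 41]]   # one list of slice indices per scan

slices = med.load_specific_slices_from_series(paths, (128, 128), slices_to_take)
print(slices.shape)
# (5, 128, 128) -- 3 slices from the first scan plus 2 from the second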
def stack_modalities(arrays, axis=-1)

Turn a list of arrays into one multimodal array.

Creates one array where each element has len(arrays) images.

Example

If we have a dataset like:

dataset/
    ID-1/
        flair.nii.gz
        t1.nii.gz
    ID-2/
        flair.nii.gz
        t1.nii.gz

then:

import medpicpy as med 
modalities = [["flair"], ["t1"]]
paths_for_modality = [med.get_paths_from_ids(
    "dataset/",
    ["ID-1", "ID-2"],
    path_filters = modality
) for modality in modalities]

arrays = [med.load_series_from_paths(
    paths,
    (128, 128),
    range(60, 80)
) for paths in paths_for_modality]

multimodal_array = med.stack_modalities(arrays)
print(multimodal_array.shape)
# (2, 20, 128, 128, 2)

You might want to flatten along the first axis after doing this depending on the dimensionality of the model you are using.

flat_multi_modal = multimodal_array.reshape(-1, *multimodal_array.shape[2:])
print("multi modal shape: ", flat_multi_modal.shape)
# (40, 128, 128, 2)

Args

arrays : array
array of arrays of images to stack on top of each other
axis : int, optional
The axis to stack along, leaving this default is probably fine. Defaults to -1.

Returns

array
the arrays stacked on top of each other