Module medpicpy.paths
medpicpy's functions for finding and filtering paths to image data.
Expand source code
"""
medpicpy's functions for finding and filtering paths
to image data.
"""
import glob
import pathlib
import os
from .utils import remove_sub_paths
def get_paths_to_images(data_dir, extension, path_filters=[""]):
    """Search a directory and its subdirectories for images with the
    given extension.

    Optionally takes a list of strings to be applied as filters
    to the path e.g. ["CT", "prone"] or ["flair"]; a path is kept only
    if it contains every filter as a substring. The returned paths
    can then be passed to load_images_from_paths.

    get_paths_from_ids is preferred where possible since
    this function could return results in a different order depending
    on the machine.

    Args:
        data_dir (str): path to root of dataset
        extension (str): file extension to search for, including the
            leading dot (e.g. ".dcm"). Multi-part extensions such as
            ".nii.gz" are matched against the end of the file name.
            Pass "" to collect every file and directory below
            data_dir (e.g. for directories holding 3d dicoms).
        path_filters (list, optional): filters to apply to paths.
            Defaults to [""], which matches every path.

    Returns:
        list: list of paths to images
    """
    if extension == "":
        # No extension to match on: glob everything below data_dir
        # (files and directories) and leave the selection to the filters.
        paths = glob.glob(data_dir + "/**/*", recursive=True)
        # The default [""] filter is a substring of every path, so the
        # filtering step can run unconditionally.
        return filter_paths(paths, path_filters)

    # pathlib's ``suffix`` is only the last dotted component (".gz" for
    # "scan.nii.gz"), so an extension with several dots has to be
    # compared against the end of the file name instead.
    multi_part_extension = extension.count(".") > 1

    paths = []
    for root, _dirs, files in os.walk(data_dir):
        for name in files:
            has_extension = pathlib.Path(name).suffix == extension or (
                multi_part_extension and name.endswith(extension)
            )
            if not has_extension:
                continue
            path = os.path.join(root, name)
            if all(path_filter in path for path_filter in path_filters):
                paths.append(path)
    return paths
def filter_paths(paths, filters):
    """Keep only the paths that contain every one of the given filters.

    Each filter is tested as a plain substring of the path. Used by
    get_paths_to_images.

    ## Example
    ```python
    import medpicpy as med

    paths = ["data/ID-001/PRONE/1.dcm", "data/ID-001/SUPINE/1.dcm", "data/ID-002/PRONE/1.dcm", "data/ID-002/SUPINE/1.dcm"]
    filters = ["PRONE"]

    paths = med.filter_paths(paths, filters)
    print(paths)
    # ["data/ID-001/PRONE/1.dcm", "data/ID-002/PRONE/1.dcm"]
    ```

    Args:
        paths (array): array of paths
        filters (array): substrings every returned path must contain

    Returns:
        array: the paths that contain all of the filters, in their
            original order
    """
    kept = []
    for candidate in paths:
        # Evaluate every filter against this path before deciding.
        hits = [needle in candidate for needle in filters]
        if all(hits):
            kept.append(candidate)
    return kept
def get_paths_from_ids(data_dir,
                       ids,
                       path_filters=[""],
                       read_individual_files=True):
    """Read in a dataset from a list of patient ids, optionally filtering
    the path. e.g. ["CT", "supine", ".nii.gz"].

    Use this if your dataset has a structure like
    'data_dir/patient_id/.../image'.
    Optionally searches for directories instead of
    individual files, use this for e.g dicom series.
    You may want to include the file extension in the
    filters.

    ## Example
    If there is dataset with structure:
    ```
    data/
        patient-data.csv
        ID-001/
            SCANS/
                CT/
                    prone.dcm
                    supine.dcm
                DX/
                    scan.nii.gz
        ID-002/
            SCANS/
                CT/
                    prone.dcm
                    supine.dcm
                DX/
                    scan.nii.gz
        ID-003/
            SCANS/
                CT/
                    prone.dcm
                    supine.dcm
                DX/
                    scan.nii.gz
    ```
    then:
    ```python
    import pandas as pd
    import medpicpy as med

    description = pd.read_csv("data/patient-data.csv")
    patient_ids = description["id"]
    filters = ["CT", "prone"]

    image_paths = med.get_paths_from_ids(
        "data",
        patient_ids,
        filters
    )
    print(image_paths)
    # e.g. ["data/ID-001/SCANS/CT/prone.dcm", "data/ID-002/SCANS/CT/prone.dcm", "data/ID-003/SCANS/CT/prone.dcm"]
    ```

    With directory structure like so, with
    images in a dicom series then set read_individual_files=False:
    ```
    data/
        patient-data.csv
        ID-001/
            SCANS/
                CT/
                    001.dcm
                    002.dcm
                DX/
                    scan.nii.gz
        ID-002/
            SCANS/
                CT/
                    001.dcm
                    002.dcm
                DX/
                    scan.nii.gz
        ID-003/
            SCANS/
                CT/
                    001.dcm
                    002.dcm
                DX/
                    scan.nii.gz
    ```
    then:
    ```python
    import pandas as pd
    import medpicpy as med

    description = pd.read_csv("data/patient-data.csv")
    patient_ids = description["id"]
    filters = ["CT"]

    image_paths = med.get_paths_from_ids(
        "data",
        patient_ids,
        filters,
        read_individual_files=False
    )
    print(image_paths)
    # e.g. ["data/ID-001/SCANS/CT/", "data/ID-002/SCANS/CT/", "data/ID-003/SCANS/CT/"]
    ```

    Args:
        data_dir (str): path to dataset
        ids (list or array-like): list of ids to read in, assuming each
            id is a directory in the dataset (e.g. TCIA datasets).
            Non-string ids (e.g. integers) are converted with str().
        path_filters (list, optional): Any filters to apply to the path;
            a path is kept only if it contains every filter as a
            substring. Defaults to [""].
        read_individual_files (bool, optional): specifies to look for
            individual files or directories. Defaults to True.

    Returns:
        array: All paths that match the ids with filters, grouped in
            the same order as ids. An id with no matching path
            contributes a single None (and a warning is printed); an id
            with several matching paths contributes all of them.
    """
    paths = []
    for id_number in ids:
        # ids often come straight from a CSV column and may be ints;
        # coerce so the string concatenation below cannot raise TypeError.
        id_dir = data_dir + "/" + str(id_number)
        # A trailing "/" in the pattern restricts the glob to directories.
        pattern = id_dir + ("/**/*" if read_individual_files else "/**/")
        paths_for_id = glob.glob(pattern, recursive=True)

        for path_filter in path_filters:
            paths_for_id = [path for path in paths_for_id if path_filter in path]

        if paths_for_id:
            # NOTE(review): remove_sub_paths comes from medpicpy.utils;
            # presumably it drops entries that are parents of other
            # matches - confirm against its implementation.
            paths_for_id = remove_sub_paths(paths_for_id)

        if not paths_for_id:
            # Keep a placeholder so a missing id stays visible to the caller.
            paths.append(None)
            print("Warn: Could not find any paths for id {}".format(id_number))
        else:
            paths.extend(paths_for_id)
    return paths
Functions
def filter_paths(paths, filters)
-
Filters a list of paths so it only contains paths containing all of the given filters.
Used by get_paths_to_images.
Example
import medpicpy as med
paths = ["data/ID-001/PRONE/1.dcm", "data/ID-001/SUPINE/1.dcm", "data/ID-002/PRONE/1.dcm", "data/ID-002/SUPINE/1.dcm"]
filters = ["PRONE"]
paths = med.filter_paths(paths, filters)
print(paths)
# ["data/ID-001/PRONE/1.dcm", "data/ID-002/PRONE/1.dcm"]
Args
paths
:array
- array of paths
filters
:array
- filters paths must contain
Returns
array
- paths that contain the filters
Expand source code
def filter_paths(paths, filters):
    """Keep only the paths that contain every one of the given filters.

    Each filter is tested as a plain substring of the path. Used by
    get_paths_to_images.

    ## Example
    ```python
    import medpicpy as med

    paths = ["data/ID-001/PRONE/1.dcm", "data/ID-001/SUPINE/1.dcm", "data/ID-002/PRONE/1.dcm", "data/ID-002/SUPINE/1.dcm"]
    filters = ["PRONE"]

    paths = med.filter_paths(paths, filters)
    print(paths)
    # ["data/ID-001/PRONE/1.dcm", "data/ID-002/PRONE/1.dcm"]
    ```

    Args:
        paths (array): array of paths
        filters (array): substrings every returned path must contain

    Returns:
        array: the paths that contain all of the filters, in their
            original order
    """
    kept = []
    for candidate in paths:
        # Evaluate every filter against this path before deciding.
        hits = [needle in candidate for needle in filters]
        if all(hits):
            kept.append(candidate)
    return kept
def get_paths_from_ids(data_dir, ids, path_filters=[''], read_individual_files=True)
-
Read in a dataset from a list of patient ids, optionally filtering the path. e.g. ["CT", "supine", ".nii.gz"].
Use this if your dataset has a structure like 'data_dir/patient_id/…/image'. Optionally searches for directories instead of individual files, use this for e.g dicom series. You may want to include the file extension in the filters.
Example
If there is dataset with structure:
data/ patient-data.csv ID-001/ SCANS/ CT/ prone.dcm supine.dcm DX/ scan.nii.gz ID-002/ SCANS/ CT/ prone.dcm supine.dcm DX/ scan.nii.gz ID-003/ SCANS/ CT/ prone.dcm supine.dcm DX/ scan.nii.gz
then:
import pandas as pd
import medpicpy as med
description = pd.read_csv("data/patient-data.csv")
patient_ids = description["id"]
filters = ["CT", "prone"]
image_paths = med.get_paths_from_ids(
    "data/",
    patient_ids,
    filters
)
print(image_paths)
# ["data/ID-001/CT/prone.dcm", "data/ID-002/CT/prone.dcm", "data/ID-003/CT/prone.dcm"]
With directory structure like so, with images in a dicom series then set read_individual_files=False:
data/ patient-data.csv ID-001/ SCANS/ CT/ 001.dcm 002.dcm DX/ scan.nii.gz ID-002/ SCANS/ CT/ 001.dcm 002.dcm DX/ scan.nii.gz ID-003/ SCANS/ CT/ 001.dcm 002.dcm DX/ scan.nii.gz
then:
import pandas as pd
import medpicpy as med
description = pd.read_csv("data/patient-data.csv")
patient_ids = description["id"]
filters = ["CT"]
image_paths = med.get_paths_from_ids(
    "data/",
    patient_ids,
    filters,
    read_individual_files=False
)
print(image_paths)
# ["data/ID-001/CT/", "data/ID-002/CT/", "data/ID-003/CT/"]
Args
data_dir
:str
- path to dataset
ids
:list or array-like
- list of ids to read in, assuming each id is a directory in the dataset (e.g. TCIA datasets)
path_filters
:list, optional
- Any filters to apply to the path. Defaults to [""].
read_individual_files
:bool, optional
- specifies to look for individual files or directories. Defaults to True.
Returns
array
- All paths that match the ids with filters, in the same order as ids
Expand source code
def get_paths_from_ids(data_dir,
                       ids,
                       path_filters=[""],
                       read_individual_files=True):
    """Read in a dataset from a list of patient ids, optionally filtering
    the path. e.g. ["CT", "supine", ".nii.gz"].

    Use this if your dataset has a structure like
    'data_dir/patient_id/.../image'.
    Optionally searches for directories instead of
    individual files, use this for e.g dicom series.
    You may want to include the file extension in the
    filters.

    ## Example
    If there is dataset with structure:
    ```
    data/
        patient-data.csv
        ID-001/
            SCANS/
                CT/
                    prone.dcm
                    supine.dcm
                DX/
                    scan.nii.gz
        ID-002/
            SCANS/
                CT/
                    prone.dcm
                    supine.dcm
                DX/
                    scan.nii.gz
        ID-003/
            SCANS/
                CT/
                    prone.dcm
                    supine.dcm
                DX/
                    scan.nii.gz
    ```
    then:
    ```python
    import pandas as pd
    import medpicpy as med

    description = pd.read_csv("data/patient-data.csv")
    patient_ids = description["id"]
    filters = ["CT", "prone"]

    image_paths = med.get_paths_from_ids(
        "data",
        patient_ids,
        filters
    )
    print(image_paths)
    # e.g. ["data/ID-001/SCANS/CT/prone.dcm", "data/ID-002/SCANS/CT/prone.dcm", "data/ID-003/SCANS/CT/prone.dcm"]
    ```

    With directory structure like so, with
    images in a dicom series then set read_individual_files=False:
    ```
    data/
        patient-data.csv
        ID-001/
            SCANS/
                CT/
                    001.dcm
                    002.dcm
                DX/
                    scan.nii.gz
        ID-002/
            SCANS/
                CT/
                    001.dcm
                    002.dcm
                DX/
                    scan.nii.gz
        ID-003/
            SCANS/
                CT/
                    001.dcm
                    002.dcm
                DX/
                    scan.nii.gz
    ```
    then:
    ```python
    import pandas as pd
    import medpicpy as med

    description = pd.read_csv("data/patient-data.csv")
    patient_ids = description["id"]
    filters = ["CT"]

    image_paths = med.get_paths_from_ids(
        "data",
        patient_ids,
        filters,
        read_individual_files=False
    )
    print(image_paths)
    # e.g. ["data/ID-001/SCANS/CT/", "data/ID-002/SCANS/CT/", "data/ID-003/SCANS/CT/"]
    ```

    Args:
        data_dir (str): path to dataset
        ids (list or array-like): list of ids to read in, assuming each
            id is a directory in the dataset (e.g. TCIA datasets).
            Non-string ids (e.g. integers) are converted with str().
        path_filters (list, optional): Any filters to apply to the path;
            a path is kept only if it contains every filter as a
            substring. Defaults to [""].
        read_individual_files (bool, optional): specifies to look for
            individual files or directories. Defaults to True.

    Returns:
        array: All paths that match the ids with filters, grouped in
            the same order as ids. An id with no matching path
            contributes a single None (and a warning is printed); an id
            with several matching paths contributes all of them.
    """
    paths = []
    for id_number in ids:
        # ids often come straight from a CSV column and may be ints;
        # coerce so the string concatenation below cannot raise TypeError.
        id_dir = data_dir + "/" + str(id_number)
        # A trailing "/" in the pattern restricts the glob to directories.
        pattern = id_dir + ("/**/*" if read_individual_files else "/**/")
        paths_for_id = glob.glob(pattern, recursive=True)

        for path_filter in path_filters:
            paths_for_id = [path for path in paths_for_id if path_filter in path]

        if paths_for_id:
            # NOTE(review): remove_sub_paths comes from medpicpy.utils;
            # presumably it drops entries that are parents of other
            # matches - confirm against its implementation.
            paths_for_id = remove_sub_paths(paths_for_id)

        if not paths_for_id:
            # Keep a placeholder so a missing id stays visible to the caller.
            paths.append(None)
            print("Warn: Could not find any paths for id {}".format(id_number))
        else:
            paths.extend(paths_for_id)
    return paths
def get_paths_to_images(data_dir, extension, path_filters=[''])
-
Search directory and subdirectories for images with the given extension.
Optionally takes a list of strings to be applied as filters to the path e.g. ["CT", "prone"] or ["flair"]. These paths can then be passed to load_images_from_paths.
get_paths_from_ids is preferred where possible since this function could return results in a different order depending on the machine.
Args
data_dir
:str
- path to root of dataset
extension
:str
- file extension to search for
path_filters
:list, optional
- filters to apply to paths. Defaults to [""].
Returns
list
- list of paths to images
Expand source code
def get_paths_to_images(data_dir, extension, path_filters=[""]):
    """Search a directory and its subdirectories for images with the
    given extension.

    Optionally takes a list of strings to be applied as filters
    to the path e.g. ["CT", "prone"] or ["flair"]; a path is kept only
    if it contains every filter as a substring. The returned paths
    can then be passed to load_images_from_paths.

    get_paths_from_ids is preferred where possible since
    this function could return results in a different order depending
    on the machine.

    Args:
        data_dir (str): path to root of dataset
        extension (str): file extension to search for, including the
            leading dot (e.g. ".dcm"). Multi-part extensions such as
            ".nii.gz" are matched against the end of the file name.
            Pass "" to collect every file and directory below
            data_dir (e.g. for directories holding 3d dicoms).
        path_filters (list, optional): filters to apply to paths.
            Defaults to [""], which matches every path.

    Returns:
        list: list of paths to images
    """
    if extension == "":
        # No extension to match on: glob everything below data_dir
        # (files and directories) and leave the selection to the filters.
        paths = glob.glob(data_dir + "/**/*", recursive=True)
        # The default [""] filter is a substring of every path, so the
        # filtering step can run unconditionally.
        return filter_paths(paths, path_filters)

    # pathlib's ``suffix`` is only the last dotted component (".gz" for
    # "scan.nii.gz"), so an extension with several dots has to be
    # compared against the end of the file name instead.
    multi_part_extension = extension.count(".") > 1

    paths = []
    for root, _dirs, files in os.walk(data_dir):
        for name in files:
            has_extension = pathlib.Path(name).suffix == extension or (
                multi_part_extension and name.endswith(extension)
            )
            if not has_extension:
                continue
            path = os.path.join(root, name)
            if all(path_filter in path for path_filter in path_filters):
                paths.append(path)
    return paths