# DataProcessing/DataPreparation/image_preparation.py

import pandas as pd
import os
import numpy as np
import pydicom
import json
import SimpleITK as sitk
import sys
import cv2
from os import listdir
from os.path import isfile, join
import random
from shutil import copyfile

# Seed the `random` module; the session shuffle in the `__main__` section draws from it.
random.seed(1)
"""
This script reads 3 csv files generated by BigQuery on MIMIC CXR. Each csv file has a list of DICOM images
for conditions: CHF, Pneumonia and Normal 
The script:
 1) reads and resizes original DICOM images to png,
 2) copies them to a folder (i.e. CHF, Pneumonia, Normal),
 3) finally, creates 38 folders with samples of =<30 images for the eye gaze experiment
"""


# On Python 3 there is no builtin `unicode`; alias it to `str` so the helpers
# below that still use the Python 2 name keep working.
if sys.version[:1] == '3':
    unicode = str

def dicom_dataset_to_dict(dicom_header):
    '''
    Auxiliary method that flattens a DICOM header into a python dictionary.

    Pixel data is skipped. Nested datasets are converted recursively and stored under their tag;
    the members of sequence items are merged into the top-level dictionary under their names
    (members called 'LUT Data' are skipped); every other element is stored under its name.
    :param dicom_header: input DICOM header
    :return: python dictionary of the DICOM header
    '''
    pixel_data_tag = (0x7fe0, 0x0010)
    header_dict = {}
    # The result is discarded; presumably this forces pydicom to parse every element
    # before we iterate over them -- TODO confirm.
    repr(dicom_header)
    for element in dicom_header.values():
        if element.tag == pixel_data_tag:
            # discard pixel data
            continue
        if type(element.value) == pydicom.dataset.Dataset:
            header_dict[element.tag] = dicom_dataset_to_dict(element.value)
        elif type(element.value) == pydicom.sequence.Sequence:
            for item in element.value:
                for member in item:
                    if member.name != 'LUT Data':
                        header_dict[member.name] = _convert_value(member.value)
        else:
            header_dict[element.name] = _convert_value(element.value)
    return header_dict


def _sanitise_unicode(s):
    return s.replace(u"\u0000", "").strip()


def _convert_value(v):
    t = type(v)
    if t in (list, int, float):
        cv = v
    elif t == str:
        cv = _sanitise_unicode(v)
    elif t == bytes:
        s = v.decode('utf-8', 'ignore')
        cv = _sanitise_unicode(s)
    elif t == pydicom.valuerep.DSfloat:
        cv = float(v)
    elif t == pydicom.valuerep.IS:
        cv = int(v)
    elif t == pydicom.valuerep.PersonName3:
        cv = str(v)
    else:
        cv = repr(v)
    return cv


def resize_pad(image, height=1920, width=1024):
    '''
    Rescale an image while keeping its aspect ratio, then centre it on a black canvas.

    The longest side of the input is scaled to ``width`` pixels and black borders are added so that the
    result has ``width`` rows and ``height`` columns.
    NOTE: despite their names, ``height`` is used as the horizontal extent of the output and ``width``
    as the vertical one.
    :param image: input image to resize and pad; its first two dimensions are read as (rows, columns)
    :param height: number of columns of the padded output
    :param width: length the longest input side is scaled to, and number of rows of the padded output
    :return: tuple (padded image, top, bottom, left, right, ratio) where top/bottom/left/right are the
             border sizes added on each side and ratio is the scale factor that was applied
    '''
    rows, cols = image.shape[:2]

    ratio = float(width) / max(rows, cols)
    scaled_rows = int(rows * ratio)
    scaled_cols = int(cols * ratio)

    # cv2.resize takes the target size as (columns, rows)
    scaled = cv2.resize(image, (scaled_cols, scaled_rows), interpolation=cv2.INTER_NEAREST)

    # Split the remaining space evenly; the odd pixel (if any) goes to the bottom / right border
    pad_rows = width - scaled_rows
    pad_cols = height - scaled_cols
    top = pad_rows // 2
    bottom = pad_rows - top
    left = pad_cols // 2
    right = pad_cols - left

    black = [0, 0, 0]
    padded = cv2.copyMakeBorder(scaled, top, bottom, left, right, cv2.BORDER_CONSTANT, value=black)
    return padded, top, bottom, left, right, ratio


def get_dtype(data):
    """
    Given a dict, generate a nested numpy dtype.

    Nested dicts become nested structured dtypes. Text values are stored as fixed-width byte strings:
    they are encoded as UTF-8 (identical to ASCII for pure-ASCII text) and padded with spaces up to the
    next 64 byte boundary. Every other value is passed through ``np.array`` and contributes a field with
    the resulting dtype and shape.
    :param data: dictionary mapping field names to values or nested dictionaries
    :return: numpy structured dtype with one field per key, in the dictionary's iteration order
    """
    fields = []
    for key, value in data.items():
        if isinstance(value, dict):
            fields.append((key, get_dtype(value)))
            continue

        if isinstance(value, str):
            # pytables does not support unicode, so text is stored as bytes.
            # (The previous `unicode(value, "ascii")` call raised TypeError for every str on Python 3.)
            value = value.encode('utf-8')
        if isinstance(value, bytes):
            # make strings go to the next 64 character boundary
            # pytables requires an 8 character boundary
            value += b' ' * (64 - (len(value) % 64))

        value = np.array(value)
        fields.append((key, value.dtype, value.shape))
    return np.dtype(fields)

def _first_window_value(raw):
    '''
    Auxiliary method to read one window parameter as a float
    :param raw: number, list/tuple of numbers, or text such as "2047" or "['2047', '2047']"
    :return: the value (the first one when several are given) as float
    '''
    if isinstance(raw, (list, tuple)):
        raw = raw[0]
    if isinstance(raw, str):
        # Strip list punctuation and quotes, then keep the first comma separated entry.
        # This replaces an eval() of header text: same result for list-like text, no code execution.
        for unwanted in (" ", "'", '"', "[", "]"):
            raw = raw.replace(unwanted, "")
        raw = raw.split(',')[0]
    return float(raw)


def apply_windowing(image, info):
    '''
    Auxiliary method to apply windowing

    When ``info`` lacks 'Window Center' or 'Window Width' the image is min-max normalised to 0..255 instead.
    With multiple windows the first one is used.
    :param image: input image
    :param info: dictionary with DICOM header information of the image
    :return: image with windowing applied (uint16 array), or the normalised 8 bit image when no
             window is available
    '''
    if 'Window Center' not in info or 'Window Width' not in info:
        return cv2.normalize(image, dst=np.array([]), alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8UC1)

    wc = _first_window_value(info['Window Center'])
    ww = _first_window_value(info['Window Width'])

    # Window bounds; the lower bound is clamped at zero
    wl = wc - ww * 0.5
    if wl < 0:
        wl = 0
    wu = wc + ww * 0.5

    image = sitk.GetImageFromArray(image)
    image = sitk.IntensityWindowing(image, wl, wu)
    image = np.asarray(sitk.GetArrayFromImage(image), np.uint16)
    return image


def sample_table(df, condition, dicom_folder, sample_size=30):
    '''
    Main method to sample and prepare the images for the eye gaze experiment

    Steps:
     1) drop a fixed list of lateral DICOMs and the age groups '0 - 10', '10 - 20' and '> 80',
     2) for every (anchor_age, gender) group draw up to ``sample_size`` rows with a fixed random_state,
     3) for each sampled DICOM whose ViewPosition is 'PA': apply windowing, resize/pad it and write it to
        Data/All/<dicom_id>.png; whenever the number of written images reaches a multiple of 20 an extra
        1080x1920 black frame with a white dot in its centre is written to Data/All as well
        (presumably a fixation target for the experiment -- confirm),
     4) record the row of every written image together with its resize ratio and padding.
    Folders Data/<condition>/All and Data/<condition>/<age>_<gender> are created, but images are only
    written to Data/All.

    :param df: dataframe for each condition as prepared from BigQuery notebook (i.e. CHF.csv, pneumonia.csv, normal.csv)
    :param condition: name of the condition to sample (i.e. CHF, pneumonia, normals)
    :param dicom_folder: folder that holds the <dicom_id>.dcm files
    :param sample_size: number of images to sample per age, gender
    :return: dataframe with the sampled images: the columns of ``df`` plus image_ratio, image_top,
             image_bottom, image_left and image_right (stored as strings)
    '''
    #need to remove certain dicoms because they are lateral
    lateral_ids = [
        "5c13613e-ef59f921-1d415722-5b44c97f-aee12446",
        "20e6d7e5-95dbe5ce-dc5e7723-0999596b-e073bdd1",
        "71bf1bb2-7e38d563-9c161ee3-358ff751-d60d7764",
        "bbecb088-ef5b8f91-a2eb4213-5ea73654-8ea1823c",
        "f9827698-aed9071f-1e06447a-8619201f-8dade2da",
        "7c2009f1-ee0f8421-08560c62-1406c85a-e637c07b",
        "84ac762a-197e9336-65d15eee-0760e16d-7df96f81",
    ]
    df = df[~df['dicom_id'].isin(lateral_ids)]

    #keep only patients above 20 and below 80 years old
    df = df[(df.anchor_age != '0 - 10') & (df.anchor_age != '10 - 20') & (df.anchor_age != '> 80')]

    geometry_columns = ['image_ratio', 'image_top', 'image_bottom', 'image_left', 'image_right']
    # Rows are gathered in a list and turned into a dataframe once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole table on every call.
    rows = []

    #Group by age and gender
    for k, gp in df.groupby(['anchor_age', 'gender']):
        sampled_cases = gp.sample(n=min(sample_size, gp.shape[0]), replace=False, random_state=2)

        print (k, ' ', sampled_cases.shape)
        group_name = (k[0] + '_' + k[1]).replace("'", "").replace(' - ', '-')

        os.makedirs(os.path.join('Data', 'All'), exist_ok=True)
        os.makedirs(os.path.join('Data', condition, 'All'), exist_ok=True)
        os.makedirs(os.path.join('Data', condition, group_name), exist_ok=True)

        for i, imagename in enumerate(sampled_cases["dicom_id"].values):
            ds = pydicom.dcmread(os.path.join(dicom_folder, imagename + '.dcm'))
            view_position = getattr(ds, 'ViewPosition', None)
            if view_position != 'PA':
                # only PA views are used; report what was skipped
                print(view_position)
                continue

            try:
                dictionary = dicom_dataset_to_dict(ds)
            except Exception:
                # fall back to an empty header: apply_windowing then normalises the image instead
                print("error reading dictionary", imagename)
                dictionary = {}
            image = ds.pixel_array.copy().astype(np.uint16)
            image = apply_windowing(image, dictionary)
            resized_image, top, bottom, left, right, ratio = resize_pad(image)

            cv2.imwrite(os.path.join('Data', 'All', imagename + '.png'), resized_image.astype(np.uint8))

            if (len(rows) + 1) % 20 == 0:
                # white filled circle in the middle of a black frame, stored under a shortened name
                blank_image = np.zeros((1080, 1920, 3), dtype=np.uint8)
                blank_image = cv2.circle(blank_image, (960, 540), 30, (255, 255, 255), -1)
                cv2.imwrite(os.path.join('Data', 'All', imagename[:len(imagename) - 5] + '.png'), blank_image)

            row = sampled_cases.iloc[i].to_dict()
            row.update({'image_ratio': str(ratio), 'image_top': str(top), 'image_bottom': str(bottom),
                        'image_left': str(left), 'image_right': str(right)})
            rows.append(row)

    print('Total rows: ', len(rows))
    return pd.DataFrame(rows, columns=list(df.columns) + geometry_columns)


if __name__ == '__main__':
    # Script entry point: sample every condition, write the master sheet, then split the
    # prepared images into session folders of at most 30 files each.

    # Replace with the dicom folder you downloaded the MIMIC-CXR Database images
    dicom_folder = '/gpfs/fs0/data/mimic_cxr/images/'

    #Create a new folder to store data the eye gaze experiment
    # (makedirs with exist_ok tolerates existing folders but still reports real failures,
    #  unlike the previous blanket `except: pass`)
    folder = os.path.join('Data', 'All')
    os.makedirs(folder, exist_ok=True)

    #There are three conditions
    #!!! We needed to increase the sampling size for CHF due to limited cases based on age/gender criteria!!!!
    sample_sizes = {'pneumonia': 30, 'normals': 30, 'CHF': 48}
    tables = {}
    for condition in ['pneumonia', 'normals', 'CHF']:
        case_list = pd.read_csv(os.path.join('../../Resources', condition + '.csv'))
        tables[condition] = sample_table(case_list, condition, dicom_folder, sample_sizes[condition])

    master_sheet = pd.concat([tables['CHF'], tables['pneumonia'], tables['normals']])
    master_sheet.to_csv(os.path.join('Data', 'master_sheet.csv'), index=False, header=True)

    #Create a new folder 'Sessions' where to store all cases to use for the eye gaze experiment
    #These sessions were used on the eye gaze experiment
    sessions_folder = os.path.join('Data', 'Sessions')
    os.makedirs(sessions_folder, exist_ok=True)

    #Read images from folder
    files = [f for f in listdir(folder) if isfile(join(folder, f))]

    # NOTE(review): listdir returns names in arbitrary order, so even with the fixed seed the
    # shuffled order can differ between machines; sort `files` first if that matters.
    random.shuffle(files)

    for i, filename in enumerate(files):
        # sessions are numbered from 1 and hold 30 consecutive files of the shuffled list
        saving_folder = str(i // 30 + 1)
        if i % 30 == 0:
            os.makedirs(os.path.join(sessions_folder, saving_folder), exist_ok=True)

        copyfile(os.path.join(folder, filename), os.path.join(sessions_folder, saving_folder, filename))

    #THE IMAGES FOR THE EYE GAZE EXPERIMENT ARE STORED IN /Data/Sessions/num_folder