# Importing required libraries.
import os
import sys
import time
import logging
import pandas as pd
from tqdm import tqdm
from IPython.display import clear_output
from IPython.display import display
# Set pandas options to expand all data within rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
Step 1: Data Cleaning
Cleaning electronic health record (EHR) data is a multistep process that varies based on the type of information and its intended use. The goal of this section is to identify and clean recorded clinical events, specifically diagnoses, procedures, medications, and laboratory tests. The cleaning and standardization of laboratory results, which include numerical data, are covered in the Standardizing part of this notebook series. The steps are:
- Merging desired data tables
- Standardizing
- Removing Irrelevant Information
- Constraining to relevant time window
- Removing Redundant Information
# Setting up Directory to save Cleaned Data
= "/n/scratch/users/v/va67/EHR_TUTORIAL_WORKSPACE"
base_directory = os.path.join(base_directory, 'processed_data', 'cleaned_rawdata')
cleaned_rawdata_directory =True)
os.makedirs(cleaned_rawdata_directory, exist_ok
print(f"Directory created at: {cleaned_rawdata_directory}")
Directory created at: /n/scratch/users/v/va67/EHR_TUTORIAL_WORKSPACE/processed_data/cleaned_rawdata
Note: Within MIMIC-IV data, the records are spread across multiple files or tables. We recommend that you go through the “Getting Started” section of the module to familiarize yourself with the contents, structure, and organization of EHR data in MIMIC.
Cleaning Diagnoses Data as an Example
We will walk through the cleaning steps using the diagnoses data from MIMIC as an example. We will then define these steps in a function and use it to clean the other datasets (procedures, labs, medications). First, ensure that you have assembled all the necessary data.
# The hospital admission periods are provided in admissions.csv, and the diagnoses recorded during each admission
# are located in diagnoses_icd.csv
diagnoses_icd_file = os.path.join(base_directory, "raw_data", "physionet.org", "files", "mimiciv", "3.1", "hosp", "diagnoses_icd.csv")
admissions_file = os.path.join(base_directory, "raw_data", "physionet.org", "files", "mimiciv", "3.1", "hosp", "admissions.csv")

diagnoses_icd = pd.read_csv(diagnoses_icd_file, dtype=str)
admissions = pd.read_csv(admissions_file, dtype=str)
# Listing the columns in the diagnoses and admissions CSV files
print("Columns in diagnoses_icd.csv", diagnoses_icd.columns.tolist(),"\n")
print("Columns in admissions.csv", admissions.columns.tolist())
Columns in diagnoses_icd.csv ['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version']
Columns in admissions.csv ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admit_provider_id', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'race', 'edregtime', 'edouttime', 'hospital_expire_flag']
Our goal is to identify co-occurring clinical events. In the diagnoses_icd table, you’ll notice that the time component is missing. To estimate when these events happened, we’ll use data from the admissions table. Before proceeding, check whether the tables have missing values. Data completeness is a key metric for data quality.
Merging desired data tables
When working with real-world data, you will need to check whether the data has all of the necessary information. For example, it is important to examine the raw data to confirm whether any values are missing, and whether they are truly missing or recorded in error.
# Check for missing data in raw files before assembling the data to ensure completeness
print("Diagnoses Table - Missing Values Count")
diagnoses_icd_missing_df = pd.DataFrame({'Column': diagnoses_icd.columns, 'Missing_Values': diagnoses_icd.isna().sum()})
display(diagnoses_icd_missing_df)

print("Admissions Table - Missing Values Count")
admissions_missing_df = pd.DataFrame({'Column': admissions.columns, 'Missing_Values': admissions.isna().sum()})
display(admissions_missing_df)
Diagnoses Table - Missing Values Count
 | Column | Missing_Values |
---|---|---|
subject_id | subject_id | 0 |
hadm_id | hadm_id | 0 |
seq_num | seq_num | 0 |
icd_code | icd_code | 0 |
icd_version | icd_version | 0 |
Admissions Table - Missing Values Count
 | Column | Missing_Values |
---|---|---|
subject_id | subject_id | 0 |
hadm_id | hadm_id | 0 |
admittime | admittime | 0 |
dischtime | dischtime | 0 |
deathtime | deathtime | 534238 |
admission_type | admission_type | 0 |
admit_provider_id | admit_provider_id | 4 |
admission_location | admission_location | 1 |
discharge_location | discharge_location | 149818 |
insurance | insurance | 9355 |
language | language | 775 |
marital_status | marital_status | 13619 |
race | race | 0 |
edregtime | edregtime | 166788 |
edouttime | edouttime | 166788 |
hospital_expire_flag | hospital_expire_flag | 0 |
We only require subject_id, hadm_id, and admittime from the admissions table to create the timestamped diagnosis dataset, and since these columns have no missing values, we can proceed.
# Merging diagnoses_icd and admissions tables on 'subject_id' and 'hadm_id' columns
diagnoses_icd = pd.read_csv(diagnoses_icd_file, dtype=str)
admissions = pd.read_csv(admissions_file, dtype=str)

timed_diagnoses_icd = pd.merge(
    diagnoses_icd,
    admissions[["subject_id", "hadm_id", "admittime"]],
    how="left",
    on=["subject_id", "hadm_id"],
)
display(timed_diagnoses_icd.head())
 | subject_id | hadm_id | seq_num | icd_code | icd_version | admittime |
---|---|---|---|---|---|---|
0 | 10000032 | 22595853 | 1 | 5723 | 9 | 2180-05-06 22:23:00 |
1 | 10000032 | 22595853 | 2 | 78959 | 9 | 2180-05-06 22:23:00 |
2 | 10000032 | 22595853 | 3 | 5715 | 9 | 2180-05-06 22:23:00 |
3 | 10000032 | 22595853 | 4 | 07070 | 9 | 2180-05-06 22:23:00 |
4 | 10000032 | 22595853 | 5 | 496 | 9 | 2180-05-06 22:23:00 |
After a merge operation, it’s recommended to check whether the resulting table has any missing values. If there is a significant number of missing values, you should investigate to identify the underlying cause. For example, the join columns might have different data types ('subject_id' could be an integer in one table and a string in another), which can cause the join operation to effectively fail, matching no rows.
timed_diagnoses_missing_df = pd.DataFrame({'Column': timed_diagnoses_icd.columns, 'Missing_Values': timed_diagnoses_icd.isna().sum()})
display(timed_diagnoses_missing_df)
 | Column | Missing_Values |
---|---|---|
subject_id | subject_id | 0 |
hadm_id | hadm_id | 0 |
seq_num | seq_num | 0 |
icd_code | icd_code | 0 |
icd_version | icd_version | 0 |
admittime | admittime | 0 |
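To make the dtype pitfall mentioned above concrete, here is a minimal sketch on toy data (a hypothetical example, not part of the tutorial's dataset). With mismatched key types, the merge either raises an error or matches nothing, depending on the pandas version; casting both keys to a common dtype fixes it. This is also why we read every table with dtype=str above.

# Toy illustration of mismatched join-key dtypes (hypothetical data, not MIMIC)
left = pd.DataFrame({"subject_id": [1, 2]})                      # integer keys
right = pd.DataFrame({"subject_id": ["1", "2"],                  # string keys
                      "admittime": ["2180-05-06", "2180-06-26"]})

# Casting both sides to a common dtype before merging avoids silent mismatches
left["subject_id"] = left["subject_id"].astype(str)
merged = pd.merge(left, right, how="left", on="subject_id")
print(merged["admittime"].isna().sum())  # 0 -> every row matched an admission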
# If you have records with missing values in columns of interest (e.g., dates, icd_code, and subject_id), remove them
print(f"Table shape before null records have been removed {timed_diagnoses_icd.shape}")
timed_diagnoses_icd.dropna(subset=['admittime', 'icd_code', 'icd_version', 'subject_id'], how="any", inplace=True)
print(f"Table shape after null records have been removed {timed_diagnoses_icd.shape}")
Table shape before null records have been removed (6364488, 6)
Table shape after null records have been removed (6364488, 6)
Standardizing
Data standardization is essential for ensuring consistency in analysis. You’ll encounter this concept repeatedly throughout the data processing workflow. For example, later in this tutorial you’ll standardize the medical codes used for clinical events, and in the Step 2 notebook you’ll work on standardizing lab test units, among other tasks.
In this part, you will standardize dates. Some data tables contain only YYYY-MM-DD information, while others include both the date (YYYY-MM-DD) and time (HH:MM:SS) components. Since we analyze data at the day level, we truncate the time and keep only the date to maintain consistency.
# Removing the time component from the 'admittime' column to keep only the date (YYYY-MM-DD). This is typically done in cases where only the
# date component is relevant for the analysis.
"admittime"] = timed_diagnoses_icd["admittime"].str[:10]
timed_diagnoses_icd[
display(timed_diagnoses_icd.head())
 | subject_id | hadm_id | seq_num | icd_code | icd_version | admittime |
---|---|---|---|---|---|---|
0 | 10000032 | 22595853 | 1 | 5723 | 9 | 2180-05-06 |
1 | 10000032 | 22595853 | 2 | 78959 | 9 | 2180-05-06 |
2 | 10000032 | 22595853 | 3 | 5715 | 9 | 2180-05-06 |
3 | 10000032 | 22595853 | 4 | 07070 | 9 | 2180-05-06 |
4 | 10000032 | 22595853 | 5 | 496 | 9 | 2180-05-06 |
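Slicing the first ten characters works because MIMIC timestamps are consistently formatted. If you are less certain of the format, a more defensive variant (a sketch, not the tutorial's code) parses the timestamps first, so malformed entries surface as NaT instead of silently producing bad dates:

# Defensive alternative: parse timestamps, coercing malformed entries to NaT,
# then format back to YYYY-MM-DD strings
parsed = pd.to_datetime(timed_diagnoses_icd["admittime"], errors="coerce")
print(f"Unparseable timestamps: {parsed.isna().sum()}")
timed_diagnoses_icd["admittime"] = parsed.dt.strftime("%Y-%m-%d")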
Removing Irrelevant Information
We keep only the information we need for the downstream task. This helps save computation time and storage space.
# For diagnosis data, we'll keep 'subject_id', 'icd_code', 'icd_version', and 'admittime'.
timed_diagnoses_icd = timed_diagnoses_icd[['subject_id', 'icd_code', 'icd_version', 'admittime']]
# Renaming columns to ensure the newly generated datasets are consistent. This is also part of Standardization
# Here we rename 'admittime' to 'date' to ensure consistency with other datasets that will be created later.
timed_diagnoses_icd = timed_diagnoses_icd.rename(columns={'admittime': 'date'})
display(timed_diagnoses_icd.head(5))
 | subject_id | icd_code | icd_version | date |
---|---|---|---|---|
0 | 10000032 | 5723 | 9 | 2180-05-06 |
1 | 10000032 | 78959 | 9 | 2180-05-06 |
2 | 10000032 | 5715 | 9 | 2180-05-06 |
3 | 10000032 | 07070 | 9 | 2180-05-06 |
4 | 10000032 | 496 | 9 | 2180-05-06 |
Constraining to relevant time window
EHR data often contain erroneous entries, and in real-world EHR data the time component is especially susceptible to errors. In this step, we verify that the dates are reasonable by filtering out records that fall outside an acceptable range (for example, before the 1980s or after the current year). The corresponding code is provided below. Note that for MIMIC-IV we will skip this step, since the dates are deliberately shifted for de-identification and are not realistic.
# timed_diagnoses_icd = timed_diagnoses_icd[
#     (timed_diagnoses_icd["date"].str[:4].astype(int) >= 1980)
#     & (timed_diagnoses_icd["date"].str[:4].astype(int) <= 2024)
# ]
Removing Redundant Information
EHR data also contain redundant information, some of which can be introduced during pre-processing. For example, after removing the time component, the same clinical event may appear multiple times in the data. Since we are only concerned with unique events per day, it is important to identify and remove duplicates accordingly.
# Check for duplicated rows in your data
if timed_diagnoses_icd.duplicated().sum() > 0:
    initial_row_count = timed_diagnoses_icd.shape[0]
    print(f"Initial table size {initial_row_count}")
    print("Duplicate rows found. Removing duplicates :")
    timed_diagnoses_icd = timed_diagnoses_icd.drop_duplicates()  # Remove duplicate rows
    final_row_count = timed_diagnoses_icd.shape[0]
    print(f"Records deleted: {initial_row_count - final_row_count}")
    print(f"Table size after removing duplicates : {final_row_count}")
else:
    print("No duplicate rows found.")
Initial table size 6364488
Duplicate rows found. Removing duplicates :
Records deleted: 8007
Table size after removing duplicates : 6356481
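As noted at the start of this section, these steps can be bundled into a single function and reused for the other event tables. The sketch below is one way to do that; the function name, its parameters, and the CSV output path are illustrative assumptions, not the tutorial's exact implementation.

def clean_event_table(events, admissions, keep_cols, time_col="admittime", save_path=None):
    """Hypothetical helper bundling the steps above: merge in admission times,
    truncate timestamps to dates, keep relevant columns, drop nulls and duplicates."""
    # Merging desired data tables: attach the admission time to each event
    timed = pd.merge(
        events,
        admissions[["subject_id", "hadm_id", time_col]],
        how="left",
        on=["subject_id", "hadm_id"],
    )
    # Standardizing: keep only the YYYY-MM-DD part, rename for consistency
    timed[time_col] = timed[time_col].str[:10]
    # Removing irrelevant information: keep only the needed columns
    timed = timed[keep_cols + [time_col]].rename(columns={time_col: "date"})
    # Removing incomplete and redundant records
    timed = timed.dropna(subset=keep_cols + ["date"], how="any").drop_duplicates()
    if save_path is not None:
        timed.to_csv(save_path, index=False)
    return timed

# Example: reproduce the diagnoses cleaning above and save the result
cleaned_diagnoses = clean_event_table(
    diagnoses_icd,
    admissions,
    keep_cols=["subject_id", "icd_code", "icd_version"],
    save_path=os.path.join(cleaned_rawdata_directory, "cleaned_diagnoses.csv"),
)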
Handling Large-Scale Data
Real-world EHR datasets are typically too large to load into memory at once, so we process them in batches.
# First we need to identify the individual patients, and then we will assign them to batches. If you do not have the admissions file,
# you can simply use the diagnoses file to get the unique patient ids.
base_directory = os.path.dirname(os.getcwd())
admissions_file = os.path.join(base_directory, "raw_data", "physionet.org", "files", "mimiciv", "3.1", "hosp", "admissions.csv")

admissions = pd.read_csv(admissions_file, dtype=str)
display(admissions.head(5))

# Getting unique patient ids and sorting them
patient_ids = admissions[['subject_id']].drop_duplicates()
patient_ids = patient_ids.sort_values(by='subject_id', ascending=True)
patient_ids = patient_ids.reset_index(drop=True)

display(patient_ids.head())

# Specify the number of batches you want to have. The larger the data, the more batches you need.
num_of_batches = 8

# Assigning batch numbers from 1 to 8
patient_ids['batch_num'] = (patient_ids.index % num_of_batches) + 1

display(patient_ids)

patient_count_per_batch = patient_ids.groupby('batch_num')['subject_id'].count().reset_index().rename(columns={'subject_id': 'patient_count'})
display(patient_count_per_batch)
 | subject_id | hadm_id | admittime | dischtime | deathtime | admission_type | admit_provider_id | admission_location | discharge_location | insurance | language | marital_status | race | edregtime | edouttime | hospital_expire_flag |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10000032 | 22595853 | 2180-05-06 22:23:00 | 2180-05-07 17:15:00 | NaN | URGENT | P49AFC | TRANSFER FROM HOSPITAL | HOME | Medicaid | English | WIDOWED | WHITE | 2180-05-06 19:17:00 | 2180-05-06 23:30:00 | 0 |
1 | 10000032 | 22841357 | 2180-06-26 18:27:00 | 2180-06-27 18:49:00 | NaN | EW EMER. | P784FA | EMERGENCY ROOM | HOME | Medicaid | English | WIDOWED | WHITE | 2180-06-26 15:54:00 | 2180-06-26 21:31:00 | 0 |
2 | 10000032 | 25742920 | 2180-08-05 23:44:00 | 2180-08-07 17:50:00 | NaN | EW EMER. | P19UTS | EMERGENCY ROOM | HOSPICE | Medicaid | English | WIDOWED | WHITE | 2180-08-05 20:58:00 | 2180-08-06 01:44:00 | 0 |
3 | 10000032 | 29079034 | 2180-07-23 12:35:00 | 2180-07-25 17:55:00 | NaN | EW EMER. | P06OTX | EMERGENCY ROOM | HOME | Medicaid | English | WIDOWED | WHITE | 2180-07-23 05:54:00 | 2180-07-23 14:00:00 | 0 |
4 | 10000068 | 25022803 | 2160-03-03 23:16:00 | 2160-03-04 06:26:00 | NaN | EU OBSERVATION | P39NWO | EMERGENCY ROOM | NaN | NaN | English | SINGLE | WHITE | 2160-03-03 21:55:00 | 2160-03-04 06:26:00 | 0 |
 | subject_id |
---|---|
0 | 10000032 |
1 | 10000068 |
2 | 10000084 |
3 | 10000108 |
4 | 10000117 |
 | subject_id | batch_num |
---|---|---|
0 | 10000032 | 1 |
1 | 10000068 | 2 |
2 | 10000084 | 3 |
3 | 10000108 | 4 |
4 | 10000117 | 5 |
... | ... | ... |
223447 | 19999733 | 8 |
223448 | 19999784 | 1 |
223449 | 19999828 | 2 |
223450 | 19999840 | 3 |
223451 | 19999987 | 4 |
223452 rows × 2 columns
 | batch_num | patient_count |
---|---|---|
0 | 1 | 27932 |
1 | 2 | 27932 |
2 | 3 | 27932 |
3 | 4 | 27932 |
4 | 5 | 27931 |
5 | 6 | 27931 |
6 | 7 | 27931 |
7 | 8 | 27931 |
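With batch numbers assigned, each batch can be processed independently so that only one subset of patients is in memory at a time. Below is a minimal sketch of such a loop, assuming the hypothetical clean_event_table helper from earlier and per-batch CSV outputs (both are illustrative choices, not the tutorial's exact code). It streams the diagnoses file in chunks so the full table is never loaded at once.

# Process the diagnoses file one patient batch at a time, reading it in chunks
for batch_num in range(1, num_of_batches + 1):
    batch_patients = set(patient_ids.loc[patient_ids["batch_num"] == batch_num, "subject_id"])
    # Stream the raw file and keep only this batch's patients
    batch_chunks = []
    for chunk in pd.read_csv(diagnoses_icd_file, dtype=str, chunksize=500_000):
        batch_chunks.append(chunk[chunk["subject_id"].isin(batch_patients)])
    batch_events = pd.concat(batch_chunks, ignore_index=True)
    # Apply the same cleaning steps as before and save a per-batch output file
    cleaned_batch = clean_event_table(
        batch_events,
        admissions,
        keep_cols=["subject_id", "icd_code", "icd_version"],
        save_path=os.path.join(cleaned_rawdata_directory, f"cleaned_diagnoses_batch_{batch_num}.csv"),
    )
    print(f"Batch {batch_num}: {cleaned_batch.shape[0]} cleaned records saved")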
For the full notebook with code, please visit here.