# Importing required libraries and setting up the workspace
import os
import sys
import time
import logging
import pandas as pd
from tqdm import tqdm
from IPython.display import clear_output
from IPython.display import display
# Set pandas options to expand all data within rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Update base directory to your EHR_TUTORIAL_WORKSPACE path
base_directory = "/path/to/your/EHR_TUTORIAL_WORKSPACE"

# Setting up directory to save cleaned data
cleaned_rawdata_directory = os.path.join(base_directory, 'processed_data', 'step3_cleaned_rawdata')
os.makedirs(cleaned_rawdata_directory, exist_ok=True)
print(f"Directory created at: {cleaned_rawdata_directory}")
Step 1: Data Cleaning
Cleaning EHR data involves multiple substeps that vary depending on the data modality and the intended downstream analysis.
In this tutorial, we focus on cleaning the structured data from the MIMIC-IV hosp module by:
- Assessing Missingness
- Filtering Irrelevant Information
- Standardizing Schema
- Identifying Erroneous Data
- Removing Redundant Data
Note: Within MIMIC-IV data, the records are spread across multiple files or tables. We recommend that you go through the “Getting Started” section of the module to familiarize yourself with the contents, structure, and organization of EHR data in MIMIC.
Cleaning Diagnoses Data as an Example
We will walk through the cleaning steps using the diagnoses data from MIMIC as an example. Once these steps are outlined, we will encapsulate them into a reusable function that we will apply to the other MIMIC datasets (procedures, labs, medications) required for this tutorial. First, ensure that you have assembled all the necessary data.
# The hospital admission periods are provided in admissions.csv, and the diagnoses recorded during each admission
# are located in diagnoses_icd.csv
diagnoses_icd_file = os.path.join(base_directory, "raw_data", "physionet.org", "files", "mimiciv", "3.1", "hosp", "diagnoses_icd.csv")
admissions_file = os.path.join(base_directory, "raw_data", "physionet.org", "files", "mimiciv", "3.1", "hosp", "admissions.csv")

diagnoses_icd = pd.read_csv(diagnoses_icd_file, dtype=str)
admissions = pd.read_csv(admissions_file, dtype=str)

# Listing columns in the diagnoses and admissions csv files
print("Columns in diagnoses_icd.csv", diagnoses_icd.columns.tolist(),"\n")
print("Columns in admissions.csv", admissions.columns.tolist())
Columns in diagnoses_icd.csv ['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version']
Columns in admissions.csv ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admit_provider_id', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'race', 'edregtime', 'edouttime', 'hospital_expire_flag']
Our goal is to identify co-occurring clinical events. In the diagnoses_icd table, you’ll notice that the time component is missing. To estimate when these events happened, we’ll use the data from the admissions table. Before proceeding, check whether the tables have missing values. Data completeness is a key metric of data quality.
Assessing Missingness
Before combining the tables, we need to check for missing values in the raw EHR files. When working with large-scale EHR data, rows with missing dates or ICD codes are typically removed if the proportion of missing values is small. However, if you find a substantial number of records with missing values, further investigation is required before proceeding, as systematic missingness may indicate underlying data quality issues.
# Check for missing data in raw files before assembling the data to ensure completeness
print("Diagnoses Table - Missing Values Count")
diagnoses_icd_missing_df = pd.DataFrame({'Column': diagnoses_icd.columns, 'Missing_Values': diagnoses_icd.isna().sum()})
display(diagnoses_icd_missing_df)

print("Admissions Table - Missing Values Count")
admissions_missing_df = pd.DataFrame({'Column': admissions.columns, 'Missing_Values': admissions.isna().sum()})
display(admissions_missing_df)
Diagnoses Table - Missing Values Count
| | Column | Missing_Values |
|---|---|---|
subject_id | subject_id | 0 |
hadm_id | hadm_id | 0 |
seq_num | seq_num | 0 |
icd_code | icd_code | 0 |
icd_version | icd_version | 0 |
Admissions Table - Missing Values Count
| | Column | Missing_Values |
|---|---|---|
subject_id | subject_id | 0 |
hadm_id | hadm_id | 0 |
admittime | admittime | 0 |
dischtime | dischtime | 0 |
deathtime | deathtime | 534238 |
admission_type | admission_type | 0 |
admit_provider_id | admit_provider_id | 4 |
admission_location | admission_location | 1 |
discharge_location | discharge_location | 149818 |
insurance | insurance | 9355 |
language | language | 775 |
marital_status | marital_status | 13619 |
race | race | 0 |
edregtime | edregtime | 166788 |
edouttime | edouttime | 166788 |
hospital_expire_flag | hospital_expire_flag | 0 |
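Since the decision to drop incomplete rows depends on the proportion of missing values rather than the raw counts, it can also help to express missingness as a percentage. The following is a minimal sketch (not part of the original notebook) that adds a Missing_Percent column for the admissions table:

# Optional: express missingness as a percentage of total rows, to judge whether
# dropping the affected records is acceptable (small proportion) or whether the
# missingness looks systematic and needs further investigation.
admissions_missing_pct = pd.DataFrame({
    'Column': admissions.columns,
    'Missing_Values': admissions.isna().sum(),
    'Missing_Percent': (admissions.isna().mean() * 100).round(2),
})
display(admissions_missing_pct)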
Filtering Irrelevant Information
Next, we will identify the variables required for the downstream task and retain only those. For this example, we require subject_id, hadm_id, and icd_code from the diagnoses table, and subject_id, hadm_id, and admittime from the admissions table to construct the timestamped diagnoses dataset. Since there are no missing values in these fields, we can proceed with joining the tables.
Note: Not all datasets require the same set of variables.
For example, when working with laboratory data, it is important to retain the laboratory values and their associated units in addition to the laboratory codes. The specific variables to be preserved will depend on the requirements of the downstream task.
In this tutorial, we do not make use of laboratory result values.
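As a purely illustrative sketch of this point (not used in this tutorial), loading laboratory data would keep the result value and unit columns alongside the identifiers; the file path and column names below assume the MIMIC-IV hosp/labevents table:

# Illustrative only: retain the laboratory value and its unit in addition to the identifiers.
# labevents_file = os.path.join(base_directory, "raw_data", "physionet.org", "files", "mimiciv", "3.1", "hosp", "labevents.csv")
# labevents = pd.read_csv(
#     labevents_file,
#     usecols=["subject_id", "hadm_id", "itemid", "charttime", "valuenum", "valueuom"],
#     dtype=str,
# )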
# Merging diagnoses_icd and admissions tables on 'subject_id' and 'hadm_id' columns
diagnoses_icd = pd.read_csv(diagnoses_icd_file, dtype=str)
admissions = pd.read_csv(admissions_file, dtype=str)

timed_diagnoses_icd = pd.merge(
    diagnoses_icd,
    admissions[["subject_id", "hadm_id", "admittime"]],
    how="left",
    on=["subject_id", "hadm_id"],
)
display(timed_diagnoses_icd.head())
| | subject_id | hadm_id | seq_num | icd_code | icd_version | admittime |
|---|---|---|---|---|---|---|
0 | 10000032 | 22595853 | 1 | 5723 | 9 | 2180-05-06 22:23:00 |
1 | 10000032 | 22595853 | 2 | 78959 | 9 | 2180-05-06 22:23:00 |
2 | 10000032 | 22595853 | 3 | 5715 | 9 | 2180-05-06 22:23:00 |
3 | 10000032 | 22595853 | 4 | 07070 | 9 | 2180-05-06 22:23:00 |
4 | 10000032 | 22595853 | 5 | 496 | 9 | 2180-05-06 22:23:00 |
After performing any merge operation, it’s recommended to check whether the resulting table has any missing values.
Note: If there are a significant number of missing values, you should investigate further to identify the underlying cause. For example, the join columns might have different data types (‘subject_id’ could be an integer in one table and a string in another), which can cause the join to silently produce no matches and leave the merged columns empty.
timed_diagnoses_missing_df = pd.DataFrame({'Column': timed_diagnoses_icd.columns, 'Missing_Values': timed_diagnoses_icd.isna().sum()})
display(timed_diagnoses_missing_df)
| | Column | Missing_Values |
|---|---|---|
subject_id | subject_id | 0 |
hadm_id | hadm_id | 0 |
seq_num | seq_num | 0 |
icd_code | icd_code | 0 |
icd_version | icd_version | 0 |
admittime | admittime | 0 |
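If the check above had revealed a large number of missing admittime values, a data type mismatch on the join keys would be one of the first things to rule out. Below is a minimal sketch (not part of the original notebook) that checks and aligns the key dtypes before merging:

# Sanity check: the join keys should share a dtype in both tables. Here both tables
# were read with dtype=str, so the cast below is a no-op, but it guards against
# mismatches such as an integer subject_id in one table and a string in the other.
print(diagnoses_icd[["subject_id", "hadm_id"]].dtypes)
print(admissions[["subject_id", "hadm_id"]].dtypes)
for key in ["subject_id", "hadm_id"]:
    diagnoses_icd[key] = diagnoses_icd[key].astype(str)
    admissions[key] = admissions[key].astype(str)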
In the EHR Preprocessing: Getting Started section, we introduced the key data elements: Patient ID (subject_id), Event/Observation (icd_code), and Time (date). Records missing any of these essential elements should be removed prior to further analysis.
print(f"Table shape before null records have been removed {timed_diagnoses_icd.shape}")
timed_diagnoses_icd.dropna(subset=['admittime', 'icd_code', 'icd_version', 'subject_id'], how="any", inplace=True)
print(f"Table shape after null records have been removed {timed_diagnoses_icd.shape}")
Table shape before null records have been removed (6364488, 6)
Table shape after null records have been removed (6364488, 6)
Next, we truncate the timestamp to retain only the date, since the hour/minute/second are not required for our analysis.
Note: When working with time series medical data such as ECGs, the full timestamp should be preserved.
For the purposes of this tutorial, the date alone is sufficient.
# Removing the time component from the 'admittime' column to keep only the date (YYYY-MM-DD). This is typically done in cases where only the
# date component is relevant for the analysis.
"admittime"] = timed_diagnoses_icd["admittime"].str[:10]
timed_diagnoses_icd[
display(timed_diagnoses_icd.head())
| | subject_id | hadm_id | seq_num | icd_code | icd_version | admittime |
|---|---|---|---|---|---|---|
0 | 10000032 | 22595853 | 1 | 5723 | 9 | 2180-05-06 |
1 | 10000032 | 22595853 | 2 | 78959 | 9 | 2180-05-06 |
2 | 10000032 | 22595853 | 3 | 5715 | 9 | 2180-05-06 |
3 | 10000032 | 22595853 | 4 | 07070 | 9 | 2180-05-06 |
4 | 10000032 | 22595853 | 5 | 496 | 9 | 2180-05-06 |
Standardizing Schema
Cleaning a dataset also involves renaming and restructuring columns and tables to ensure that newly generated datasets remain consistent. Here, we rename admittime to date to maintain consistency with other datasets that will be created later.
Note: This convention allows alignment across different datasets, such as medications, laboratory results, and procedures, which will also use date as the standard timestamp field.
# For diagnosis data, we'll keep 'subject_id', 'icd_code', 'icd_version', and 'admittime'.
timed_diagnoses_icd = timed_diagnoses_icd[['subject_id', 'icd_code', 'icd_version', 'admittime']]
# Renaming columns to ensure the newly generated datasets are consistent. This is also part of Standardization
# Here we rename 'admittime' to 'date' to ensure consistency with other datasets that will be created later.
timed_diagnoses_icd = timed_diagnoses_icd.rename(columns={'admittime': 'date'})
display(timed_diagnoses_icd.head(5))
| | subject_id | icd_code | icd_version | date |
|---|---|---|---|---|
0 | 10000032 | 5723 | 9 | 2180-05-06 |
1 | 10000032 | 78959 | 9 | 2180-05-06 |
2 | 10000032 | 5715 | 9 | 2180-05-06 |
3 | 10000032 | 07070 | 9 | 2180-05-06 |
4 | 10000032 | 496 | 9 | 2180-05-06 |
Identifying Erroneous Data
Define a valid date range for the dataset (not applicable to MIMIC-IV).
It is important in real-world datasets to ensure that dates fall within a reasonable range. For instance, records with dates prior to the 1980s or beyond the current year should be excluded. The code for this operation is provided below. For MIMIC-IV, however, this step is not required, since dates are deliberately shifted as part of de-identification and calendar-based range checks are not meaningful.
Note on laboratory data:
Erroneous values and outliers are also frequently encountered in laboratory datasets. While impossible values can be safely filtered out, other issues require more care. Laboratory results may appear erroneous when the same test is recorded in different units (for example, hemoglobin reported in g/dL versus mmol/L). In such cases, applying domain knowledge to normalize units is essential for ensuring consistency and preventing valid results from being misclassified as errors. For this tutorial, we do not use laboratory values, but these checks are critical when working with real-world lab data.
# timed_diagnoses_icd = timed_diagnoses_icd[
#     (timed_diagnoses_icd["date"].str[:4].astype(int) >= 1980)
#     & (timed_diagnoses_icd["date"].str[:4].astype(int) <= 2024)
# ]
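To make the note on laboratory units concrete, the sketch below shows one way unit normalization might look. The toy hemoglobin values, the column names, and the approximate conversion factor (1 g/dL ≈ 0.6206 mmol/L) are illustrative assumptions, not part of this tutorial's pipeline:

# Illustrative only: normalize hemoglobin results reported in mmol/L to g/dL so that
# the same test is expressed in a single unit before any outlier filtering.
hgb_example = pd.DataFrame({
    'subject_id': ['1', '2'],
    'valuenum': [13.5, 8.4],          # 13.5 g/dL vs. 8.4 mmol/L for the same analyte
    'valueuom': ['g/dL', 'mmol/L'],
})
is_mmol = hgb_example['valueuom'].str.lower() == 'mmol/l'
hgb_example.loc[is_mmol, 'valuenum'] = hgb_example.loc[is_mmol, 'valuenum'] / 0.6206
hgb_example.loc[is_mmol, 'valueuom'] = 'g/dL'
display(hgb_example)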
Removing Redundant Data
Duplicate records are prevalent in EHR datasets and can arise in multiple forms.
These include exact duplicates, where identical rows are repeated, and semantic duplicates, where different medical codes represent the same underlying clinical concept.
The deduplication step is applied both before and after code standardization (described in the Code Rollup section) to ensure that all redundant information is removed. This process is essential to prevent double counting and to reduce data noise.
# Check for duplicated rows in your data
if timed_diagnoses_icd.duplicated().sum() > 0:
    initial_row_count = timed_diagnoses_icd.shape[0]
    print(f"Initial table size {initial_row_count}")
    print("Duplicate rows found. Removing duplicates :")

    timed_diagnoses_icd = timed_diagnoses_icd.drop_duplicates()  # Remove duplicate rows
    final_row_count = timed_diagnoses_icd.shape[0]

    print(f"Records deleted: {initial_row_count - final_row_count}")
    print(f"Table size after removing duplicates : {final_row_count}")
else:
    print("No duplicate rows found.")
Initial table size 6364488
Duplicate rows found. Removing duplicates :
Records deleted: 8007
Table size after removing duplicates : 6356481
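With all of the substeps walked through, they can now be collected into the reusable function mentioned earlier. The sketch below is one possible shape for it, assuming the same column conventions used above; the function in the full tutorial notebook may differ in its details, and the output filename in the usage example is hypothetical.

# Sketch of a reusable cleaning routine for MIMIC event tables that carry
# subject_id / hadm_id identifiers and rely on admissions for the timestamp.
def clean_event_table(event_df, admissions_df, event_columns, output_path=None):
    # Attach the admission time to every event record
    timed = pd.merge(
        event_df,
        admissions_df[["subject_id", "hadm_id", "admittime"]],
        how="left",
        on=["subject_id", "hadm_id"],
    )
    # Drop records missing the key data elements (patient, event, time)
    timed = timed.dropna(subset=["subject_id", "admittime"] + event_columns, how="any")
    # Keep only the date part of the timestamp and standardize the schema
    timed["admittime"] = timed["admittime"].str[:10]
    timed = timed[["subject_id"] + event_columns + ["admittime"]]
    timed = timed.rename(columns={"admittime": "date"})
    # Remove exact duplicate rows
    timed = timed.drop_duplicates()
    if output_path is not None:
        timed.to_csv(output_path, index=False)
    return timed

For the diagnoses example above, the call would look roughly like clean_event_table(diagnoses_icd, admissions, ['icd_code', 'icd_version'], os.path.join(cleaned_rawdata_directory, 'cleaned_diagnoses.csv')).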
Handling Large Scale Data
Real-world EHR datasets are typically too large to load into memory at once. As a result, processing datasets in batches becomes crucial: it keeps you within memory limits and allows the data to be processed in parallel to speed up the workload.
For batching, the first step is to identify the unique patients since batching is usually performed at the patient level to make sure that all records for an individual are processed together. Once unique patients are identified, they can be split into batches for iterative or parallel processing.
# First we need to identify the individual patients and then assign them to batches. If you do not have the admissions file,
# you can simply use the diagnoses file to get the unique patient ids.
base_directory = os.path.dirname(os.getcwd())
admissions_file = os.path.join(base_directory, "raw_data", "physionet.org", "files", "mimiciv", "3.1", "hosp", "admissions.csv")

admissions = pd.read_csv(admissions_file, dtype=str)
display(admissions.head(5))

# Getting unique patient ids and sorting them
patient_ids = admissions[['subject_id']].drop_duplicates()
patient_ids = patient_ids.sort_values(by='subject_id', ascending=True)
patient_ids = patient_ids.reset_index(drop=True)

display(patient_ids.head())

# Specify the number of batches you want. The larger the data, the more batches you need.
num_of_batches = 8

# Assigning batch numbers from 1 to 8
patient_ids['batch_num'] = (patient_ids.index % num_of_batches) + 1

display(patient_ids)

patient_count_per_batch = patient_ids.groupby('batch_num')['subject_id'].count().reset_index().rename(columns={'subject_id': 'patient_count'})
display(patient_count_per_batch)
| | subject_id | hadm_id | admittime | dischtime | deathtime | admission_type | admit_provider_id | admission_location | discharge_location | insurance | language | marital_status | race | edregtime | edouttime | hospital_expire_flag |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10000032 | 22595853 | 2180-05-06 22:23:00 | 2180-05-07 17:15:00 | NaN | URGENT | P49AFC | TRANSFER FROM HOSPITAL | HOME | Medicaid | English | WIDOWED | WHITE | 2180-05-06 19:17:00 | 2180-05-06 23:30:00 | 0 |
1 | 10000032 | 22841357 | 2180-06-26 18:27:00 | 2180-06-27 18:49:00 | NaN | EW EMER. | P784FA | EMERGENCY ROOM | HOME | Medicaid | English | WIDOWED | WHITE | 2180-06-26 15:54:00 | 2180-06-26 21:31:00 | 0 |
2 | 10000032 | 25742920 | 2180-08-05 23:44:00 | 2180-08-07 17:50:00 | NaN | EW EMER. | P19UTS | EMERGENCY ROOM | HOSPICE | Medicaid | English | WIDOWED | WHITE | 2180-08-05 20:58:00 | 2180-08-06 01:44:00 | 0 |
3 | 10000032 | 29079034 | 2180-07-23 12:35:00 | 2180-07-25 17:55:00 | NaN | EW EMER. | P06OTX | EMERGENCY ROOM | HOME | Medicaid | English | WIDOWED | WHITE | 2180-07-23 05:54:00 | 2180-07-23 14:00:00 | 0 |
4 | 10000068 | 25022803 | 2160-03-03 23:16:00 | 2160-03-04 06:26:00 | NaN | EU OBSERVATION | P39NWO | EMERGENCY ROOM | NaN | NaN | English | SINGLE | WHITE | 2160-03-03 21:55:00 | 2160-03-04 06:26:00 | 0 |
| | subject_id |
|---|---|
0 | 10000032 |
1 | 10000068 |
2 | 10000084 |
3 | 10000108 |
4 | 10000117 |
| | subject_id | batch_num |
|---|---|---|
0 | 10000032 | 1 |
1 | 10000068 | 2 |
2 | 10000084 | 3 |
3 | 10000108 | 4 |
4 | 10000117 | 5 |
... | ... | ... |
223447 | 19999733 | 8 |
223448 | 19999784 | 1 |
223449 | 19999828 | 2 |
223450 | 19999840 | 3 |
223451 | 19999987 | 4 |
223452 rows × 2 columns
| | batch_num | patient_count |
|---|---|---|
0 | 1 | 27932 |
1 | 2 | 27932 |
2 | 3 | 27932 |
3 | 4 | 27932 |
4 | 5 | 27931 |
5 | 6 | 27931 |
6 | 7 | 27931 |
7 | 8 | 27931 |
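Once every patient has a batch number, the batches can be processed one at a time or dispatched to parallel workers. The following is a minimal sketch of the iterative version (not part of the original notebook), reusing the timed_diagnoses_icd table from above and the tqdm progress bar imported at the top:

# Process the cleaned diagnoses one patient batch at a time. This keeps the working
# set small; the same loop body could be handed to parallel workers instead.
for batch_num, batch_patients in tqdm(patient_ids.groupby('batch_num'), total=num_of_batches):
    batch_subject_ids = set(batch_patients['subject_id'])
    batch_df = timed_diagnoses_icd[timed_diagnoses_icd['subject_id'].isin(batch_subject_ids)]
    # ... apply the per-batch processing steps to batch_df here ...
    print(f"Batch {batch_num}: {batch_df.shape[0]} diagnosis records for {len(batch_subject_ids)} patients")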
The complete tutorial notebook describing the data cleaning process for procedures, medications, and laboratory data is available here.
To access the full collection of tutorial notebooks, tools, visualizations, and relevant metadata files, please register here.