Step 2: Code Roll-Up

The raw codes in EHR data are typically too specific, making analysis difficult or impractical; for EHR data to be effectively compared and analyzed, the codes must be standardized. For example, medical codes in electronic health records (EHR), such as medication codes, may come from different coding systems such as NDC, RxNorm, or institution-specific local codes. “Rolling up” data from these various coding systems to a common parent coding system ensures standardization. It also helps in harmonizing data across different institutions to enable analysis on a larger scale. The MIMIC database employs widely adopted clinical coding systems—including ICD (diagnoses), CPT (procedures), NDC/RxNorm (medications), and LOINC (laboratory tests)—across its four core domains of structured data. These standardized codes are directly integrated, eliminating the need for additional code mapping. To explore the hierarchical relationships and granularity of these codes, interactive visualizations are available here.

import os
import sys
import time
import logging
import pandas as pd
from tqdm import tqdm
from IPython.display import clear_output
from IPython.display import display

# Set pandas options to expand all data within rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Setting up Directory to save Rolledup Data

# assumes the notebook runs one level below the project root -- TODO confirm
base_directory = os.path.dirname(os.getcwd())

# intermediate folder: rolled-up data including unmapped rows, kept for auditing
rolledup_intermediatedata_directory = os.path.join(base_directory, 'processed_data', 'step4_rolledup_intermediatedata')
os.makedirs(rolledup_intermediatedata_directory, exist_ok=True)
print(f"Directory created at: {rolledup_intermediatedata_directory}")

# final folder: cleaned, successfully rolled-up data only
rolledup_finaldata_directory = os.path.join(base_directory, 'processed_data', 'step4_rolledup_finaldata')
os.makedirs(rolledup_finaldata_directory, exist_ok=True)
print(f"Directory created at: {rolledup_finaldata_directory}")
Directory created at: /n/scratch/users/v/va67/EHR_TUTORIAL_WORKSPACE/processed_data/step4_rolledup_intermediatedata
Directory created at: /n/scratch/users/v/va67/EHR_TUTORIAL_WORKSPACE/processed_data/step4_rolledup_finaldata

Creating Rollup Mapping Files

1. Roll up CPT to CCS

The original mapping table provides mappings in a range format (‘T10001-T10004’ -> 128). Therefore, we first have to parse this format and then create our mapping table.

Please download the original mapping table from HCUP (https://www.hcup-us.ahrq.gov/toolssoftware/ccs_svcsproc/ccssvcproc.jsp).

import pandas as pd

def roll_cpt2ccs(output_path):
    """Build a CPT-to-CCS rollup table from the HCUP CCS mapping file.

    The HCUP file expresses mappings as quoted code ranges, e.g.
    '10021-10141' -> 233 or 'T10001-T10004' -> 128. Each range is
    expanded into one row per individual CPT/HCPCS code.

    Parameters
    ----------
    output_path : str
        Path of the CSV file the expanded mapping is written to.

    Returns
    -------
    pd.DataFrame
        Two string columns: 'cpt' and 'ccs'.
    """
    # path of the original mapping table (download from HCUP first)
    path = 'CCS_services_procedures_v2021-1.csv'

    # load the original mapping table; skiprows=1 drops the title line
    type_setting = {'Code Range': str, 'CCS': int}
    original_table = pd.read_csv(path, usecols=type_setting.keys(), skiprows=1,
                dtype=type_setting, index_col=False)

    # the final rolling-up table
    cpt2ccs = []

    for line in original_table.itertuples(False):
        # [a, b] is a continuous sequence of CPTs rolled up to a CCS
        a, b = line[0].strip('\'').split('-')
        ccs_code = line[1]

        if a == b:      # the sequence has only one CPT
            cpt2ccs.append([a, ccs_code])
        else:           # expand the range; handles letter prefixes/suffixes
            for code in _expand_code_range(a, b):
                cpt2ccs.append([code, ccs_code])

    # output the result to a csv file
    cpt2ccs = pd.DataFrame(cpt2ccs, columns=['cpt', 'ccs'], dtype='str')
    cpt2ccs.to_csv(output_path, index=False)

    return cpt2ccs


def _expand_code_range(start, end):
    """Expand an inclusive code range into a list of individual codes.

    Handles plain numeric CPT ranges ('10021'-'10040') as well as codes
    with an alphabetic prefix ('T10001'-'T10004') or suffix
    ('0001F'-'0005F'). Leading zeros are preserved by padding to the
    width of the start code. The previous int-based expansion crashed
    on lettered codes and silently dropped leading zeros.

    Raises
    ------
    ValueError
        If the two bounds do not share the same letter prefix/suffix.
    """
    def _split(code):
        # partition a code into (letter prefix, digits, letter suffix)
        head = 0
        while head < len(code) and not code[head].isdigit():
            head += 1
        tail = len(code)
        while tail > head and not code[tail - 1].isdigit():
            tail -= 1
        return code[:head], code[head:tail], code[tail:]

    prefix_a, num_a, suffix_a = _split(start)
    prefix_b, num_b, suffix_b = _split(end)
    if prefix_a != prefix_b or suffix_a != suffix_b:
        raise ValueError(f'mismatched code range: {start}-{end}')

    width = len(num_a)
    return [f'{prefix_a}{n:0{width}d}{suffix_a}'
            for n in range(int(num_a), int(num_b) + 1)]


# demo of code aggregation
# NOTE(review): requires 'CCS_services_procedures_v2021-1.csv' in the
# working directory -- download it from HCUP first (link in the text above).
output_path = 'cpt2ccs_rollup.csv'
cpt2ccs = roll_cpt2ccs(output_path)

# preview the first 10 CPT-to-CCS rows
print('\nCPT-to-CCS mapppings:')
print(cpt2ccs.iloc[:10, :])

2. Roll up ICD to PheCode

Since the original mapping table provides mappings directly, we can filter wanted rows and columns without extra operations.

Please download the original mapping table from PheWAS (https://phewascatalog.org/phecodes).

import pandas as pd

def roll_icd2phe(icd9_output_path, icd10_output_path):
    """Build ICD-9 -> PheCode and ICD-10-CM -> PheCode rollup tables.

    Reads the two PheWAS mapping files from the working directory,
    drops empty rows, strips the '.' from ICD codes (MIMIC stores
    codes without dots), removes duplicate ICDs, and writes one rollup
    CSV per vocabulary.

    Parameters
    ----------
    icd9_output_path : str
        Output CSV path for the ICD-9 rollup ('ICD9', 'PheCode').
    icd10_output_path : str
        Output CSV path for the ICD-10-CM rollup ('icd10cm', 'phecode').

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        The ICD-9 and ICD-10-CM mapping tables, in that order.
    """
    ### roll up ICD-9 to PheCode
    # load the original table
    path = 'phecode_icd9_rolled.csv'
    df_icd9 = pd.read_csv(path, usecols=['ICD9', 'PheCode'],
                          dtype='str')

    # remove empty lines (NA) *before* touching the strings: with the
    # original replace-then-dropna order, any empty cell (a float NaN)
    # crashed the '.replace' call
    df_icd9.dropna(inplace=True)

    # remove the [.] in ICD code for MIMIC rolling up
    df_icd9.loc[:, 'ICD9'] = df_icd9['ICD9'].str.replace('.', '', regex=False)

    # drop duplicate ICDs, keeping the first mapping
    df_icd9.drop_duplicates(['ICD9'], keep='first', inplace=True)

    # output the result
    df_icd9.to_csv(icd9_output_path, index=False)


    ### roll up ICD-10-CM to PheCode
    # load the original table
    path = 'Phecode_map_v1_2_icd10cm_beta.csv'
    df_icd10 = pd.read_csv(path, usecols=['icd10cm', 'phecode'],
                           dtype='str', encoding='unicode_escape')

    # remove empty lines (NA) before the string operation, as above
    df_icd10.dropna(inplace=True)

    # remove the [.] in ICD code for MIMIC rolling up. You may not need to do so in your project
    df_icd10.loc[:, 'icd10cm'] = df_icd10['icd10cm'].str.replace('.', '', regex=False)

    # drop duplicate ICDs, keeping the first mapping
    df_icd10.drop_duplicates(['icd10cm'], keep='first', inplace=True)

    # output the result
    df_icd10.to_csv(icd10_output_path, index=False)

    return df_icd9, df_icd10


# NOTE(review): requires 'phecode_icd9_rolled.csv' and
# 'Phecode_map_v1_2_icd10cm_beta.csv' (from the PheWAS catalog)
# in the working directory.
icd9_output_path = 'icd92phe_rollup.csv'
icd10_output_path = 'icd102phe_rollup.csv'
icd92phe, icd102phe = roll_icd2phe(icd9_output_path, icd10_output_path)

# preview the first 10 rows of each mapping
print('\nICD9-to-PheCode mapppings:')
print(icd92phe.iloc[:10, :])

print('\nICD10-to-PheCode mapppings:')
print(icd102phe.iloc[:10, :])

3. Roll up ICD-10-PCS and ICD-9-CM to CCS

For procedure ICD codes, we need two original mapping tables to aggregate them to CCS codes. These original tables provide mappings directly, so we can filter wanted rows and columns without extra operations.

Please download original mapping tables from HCUP.

(ICD-9-CM: https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp)

(ICD-10-PCS: https://www.hcup-us.ahrq.gov/toolssoftware/ccs10/ccs10.jsp)

import pandas as pd

def roll_icd10pcs2ccs(output_path):
    """Build an ICD-10-PCS -> CCS rollup table from the HCUP file.

    The HCUP CSV wraps every value in single quotes and pads with
    spaces; both are stripped here before de-duplicating.

    Parameters
    ----------
    output_path : str
        Output CSV path for the rollup ('icd10pcs', 'ccs').

    Returns
    -------
    pd.DataFrame
        Columns 'icd10pcs' and 'ccs', one row per ICD-10-PCS code.
    """
    # load the original table; the header names literally contain quotes
    path = 'ccs_pr_icd10pcs_2020_1.csv'
    usecols = ["'ICD-10-PCS CODE'", "'CCS CATEGORY'"]
    icd10pcs2ccs = pd.read_csv(path, usecols=usecols,
                           dtype='str', encoding='utf8', index_col=False)
    icd10pcs2ccs.rename(
        {usecols[0]: 'icd10pcs', usecols[1]: 'ccs'}, axis=1, inplace=True)

    # remove empty lines (NA) *before* the string cleanup: with the
    # original order, a NaN cell (a float) crashed the '.replace' call
    icd10pcs2ccs.dropna(inplace=True)

    # remove ['] and padding whitespace in the table
    icd10pcs2ccs.loc[:, 'icd10pcs'] = icd10pcs2ccs['icd10pcs'].str.replace('\'', '', regex=False).str.strip()
    icd10pcs2ccs.loc[:, 'ccs'] = icd10pcs2ccs['ccs'].str.replace('\'', '', regex=False).str.strip()

    # drop duplicate ICDs, keeping the first mapping
    icd10pcs2ccs.drop_duplicates(['icd10pcs'], keep='first', inplace=True)

    # output the result
    icd10pcs2ccs.to_csv(output_path, index=False)

    return icd10pcs2ccs


def roll_icd9cm2ccs(output_path):
    """Build an ICD-9-CM -> CCS rollup table from the HCUP file.

    Values are quote-wrapped and space-padded like the ICD-10-PCS file;
    additionally, rows mapped to CCS category '0' (invalid/unclassified)
    are dropped.

    Parameters
    ----------
    output_path : str
        Output CSV path for the rollup ('icd9cm', 'ccs').

    Returns
    -------
    pd.DataFrame
        Columns 'icd9cm' and 'ccs', one row per ICD-9-CM code.
    """
    # load the original table; skiprows=1 drops the title line and the
    # header names literally contain quotes
    path = '$prref 2015.csv'
    usecols = ["'ICD-9-CM CODE'", "'CCS CATEGORY'"]
    icd9cm2ccs = pd.read_csv(path, usecols=usecols, skiprows=1,
                           dtype='str', encoding='utf8', index_col=False)
    icd9cm2ccs.rename(
        {usecols[0]: 'icd9cm', usecols[1]: 'ccs'}, axis=1, inplace=True)

    # remove empty lines (NA) *before* the string cleanup: with the
    # original order, a NaN cell (a float) crashed the '.replace' call
    icd9cm2ccs.dropna(inplace=True)

    # remove ['] and padding whitespace in the table
    icd9cm2ccs.loc[:, 'icd9cm'] = icd9cm2ccs['icd9cm'].str.replace('\'', '', regex=False).str.strip()
    icd9cm2ccs.loc[:, 'ccs'] = icd9cm2ccs['ccs'].str.replace('\'', '', regex=False).str.strip()

    # drop rows mapped to CCS '0' and duplicate ICDs
    icd9cm2ccs = icd9cm2ccs[icd9cm2ccs['ccs'] != '0']
    icd9cm2ccs.drop_duplicates(['icd9cm'], keep='first', inplace=True)

    # output the result
    icd9cm2ccs.to_csv(output_path, index=False)

    return icd9cm2ccs


# NOTE(review): requires the two HCUP procedure mapping files
# ('ccs_pr_icd10pcs_2020_1.csv' and '$prref 2015.csv') in the
# working directory.
output_path = 'icd10pcs2ccs_rollup.csv'
icd10pcs2ccs = roll_icd10pcs2ccs(output_path)
print('\nICD10PCS-to-CCS mapppings:')
print(icd10pcs2ccs.iloc[:10, :])

output_path = 'icd9cm2ccs_rollup.csv'
icd9cm2ccs = roll_icd9cm2ccs(output_path)
print('\nICD9CM-to-CCS mapppings:')
print(icd9cm2ccs.iloc[:10, :])

4. Roll up NDC to RxNorm

To roll up NDC codes to RxNorm codes, we have to map RxNorm codes to their ingredients and then aggregate NDC to RxNorm. First, we need an ingredient mapping table to map all RxNorm codes to their ingredients. Here, we provide this mapping table directly.

Then, in the function roll_ndc2rxnorm, we make use of both obsolete mappings and non-obsolete ones in “RXNSAT.RRF”. The reason we include obsolete mappings is that MIMIC data are collected between 2008 and 2019, and these obsolete mappings might be helpful for code aggregation. Because RxNorm presents the relation between NDC and RxNorm explicitly, we do not need to do extra operations except adding obsolete mappings and mapping RxNorm ingredients.

Please download RxNorm files from their official website (https://www.nlm.nih.gov/research/umls/rxnorm/docs/rxnormfiles.html).

import pandas as pd

def roll_ndc2rxnorm(ingredient_path, output_path):
    """Build an NDC -> RxNorm-ingredient rollup table from RXNSAT.RRF.

    Both non-obsolete NDC attributes (SAB == 'RXNORM', SUPPRESS == 'N')
    and obsolete ones are used, because obsolete NDCs may still appear
    in historical EHR data such as MIMIC (2008-2019). Each RXCUI is
    then replaced by its ingredient-level RXCUI via *ingredient_path*.

    Parameters
    ----------
    ingredient_path : str
        CSV with columns 'base' and 'ingredient' mapping an RxNorm CUI
        to its ingredient-level CUI.
    output_path : str
        Output CSV path, written with header ('ndc', 'rxcui').

    Returns
    -------
    pd.DataFrame
        Columns 'ATV' (the NDC string) and 'RXCUI' (the ingredient CUI).
    """
    # load the original table
    print('Generating NDC-to-RxNorm mapping table...')

    path = 'rxnorm/rrf/RXNSAT.RRF'
    cols = ['RXCUI','LUI','SUI','RXAUI','STYPE','CODE','ATUI','SATUI','ATN','SAB','ATV','SUPPRESS','CVF']
    setting = {'RXCUI': int, 'SUPPRESS': str, 'ATN':str,'SAB':str,'ATV':str}
    table = pd.read_csv(path, names=cols, usecols=setting.keys(), sep='|',
            dtype=setting, index_col=False)
    print('total length', table.shape[0])

    # keep only rows carrying NDC attribute values
    table = table.loc[(table['ATN'] == 'NDC')]
    print('num of CUI', table['RXCUI'].drop_duplicates(keep='first', inplace=False).shape[0])

    # separate obsolete and non-obsolete mappings
    valid = (table['SAB'] == 'RXNORM') & (table['SUPPRESS'] == 'N')
    table_obsolete = table.loc[~valid]
    table = table.loc[valid]
    print('valid table size', table.shape[0])

    # number of non-obsolete mappings
    num = table['RXCUI'].drop_duplicates(keep='first', inplace=False)
    print('num of valid CUI', num.shape[0])

    # obsolete mappings: keep only well-formed 11-digit NDCs
    table_obsolete = table_obsolete.loc[(table_obsolete['ATV'].str.len() == 11) & table_obsolete['ATV'].str.isdigit()]
    print('obsolete table size', table_obsolete.shape[0])
    # number of obsolete mappings
    num = table_obsolete['RXCUI'].drop_duplicates(keep='first', inplace=False)
    print('num of obsolete CUI', num.shape[0])

    # merge obsolete and non-obsolete mappings; non-obsolete rows come
    # first so they win when the same NDC appears in both.
    # DataFrame.append was removed in pandas 2.0 -- use pd.concat.
    table = pd.concat([table, table_obsolete], ignore_index=True)
    table.drop_duplicates(['ATV'], keep='first', inplace=True)
    print('final size', table.shape[0])
    print('final CUIs', table['RXCUI'].drop_duplicates(keep='first', inplace=False).shape[0])

    # map CUIs to their ingredient-level CUIs (integer columns, inferred)
    ingredient = pd.read_csv(ingredient_path, usecols=['base','ingredient'],
            index_col=False)

    # update CUIs; CUIs without an ingredient mapping are kept unchanged
    dic = {line[0]:line[1] for line in ingredient.itertuples(False)}
    table['RXCUI'] = table['RXCUI'].apply(lambda x:dic.get(x, x))

    # output the final result
    table = table.loc[:, ['ATV', 'RXCUI']]
    table.drop_duplicates(keep='first', inplace=True)
    table.to_csv(output_path, sep=',', columns=['ATV', 'RXCUI'], index=False, header=['ndc', 'rxcui'])

    return table

# NOTE(review): requires the RxNorm release files under 'rxnorm/'
# (see the download link in the text above).
ingredient_path = 'rxnorm/ingredient.csv'
output_path = 'ndc2rxnorm_rollup.csv'
ndc2rxnorm = roll_ndc2rxnorm(ingredient_path, output_path)

# preview the first 10 NDC-to-RxNorm rows
print('\nNDC-to-RxNorm mapppings:')
print(ndc2rxnorm.iloc[:10, :])

Once the rollup files are generated, you can go ahead and map the low-level codes to parent-level group codes. We will start with rolling up diagnosis codes to PheCodes.

Example: Rolling up Diagnosis Codes

ICD codes are very granular, and differentiating between minor variations of a disease is often not suitable for broad research analyses and can lead to sparse data issues. PheCodes solve this problem by grouping related ICD codes into clinically meaningful phenotypes.

  • Rolled up to: PheCodes Use resources like the PheWAS Catalog and the PheWAS hierarchy to map diagnosis codes (e.g., ICD-9/10) to PheCodes. Enables grouping related diagnoses into clinically meaningful categories.

Screenshot 2025-02-10 at 2.52.42 PM.png
# Load the ICD -> PheCode rollup mapping shipped with the tutorial.
# assumes the notebook runs one level below the project root -- TODO confirm
base_directory = os.path.dirname(os.getcwd())

icd_to_phecode_file = os.path.join(base_directory, 'scripts', 'rollup_mappings',"icd_to_phecode.csv")
# read everything as strings so codes keep their leading zeros
icd_to_phecode = pd.read_csv(icd_to_phecode_file, dtype=str)
display(icd_to_phecode.head())
code PheCode coding_system
0 001 008 ICD9
1 0010 008 ICD9
2 0011 008 ICD9
3 0019 008 ICD9
4 002 008 ICD9
# We will select a sample diagnoses file for rolling up

base_directory = os.path.dirname(os.getcwd())

diagnoses_cleaned_rawdata = os.path.join(base_directory, 'processed_data', 'step3_cleaned_rawdata',"Diagnoses")
diagnoses_files = os.listdir(diagnoses_cleaned_rawdata)

# take the first cleaned diagnoses file as a demo; dtype=str keeps
# ICD codes (and their leading zeros) intact
sample_diagnoses_filepath= os.path.join(diagnoses_cleaned_rawdata, diagnoses_files[0])
sample_diagnoses = pd.read_csv(sample_diagnoses_filepath, dtype=str)
display(sample_diagnoses.head())
subject_id date code coding_system
0 10000032 2180-05-06 5723 ICD9
1 10000032 2180-05-06 78959 ICD9
2 10000032 2180-05-06 5715 ICD9
3 10000032 2180-05-06 07070 ICD9
4 10000032 2180-05-06 496 ICD9

Now, if you observe the rollup mapping file and the actual diagnoses file, you will notice a mismatch in the column names. To perform the rollup using a join operation, you will need to make the column names consistent. You can do this by renaming the columns in the rollup file.

# Merging the two tables to rollup/map icd code to phecode. Save this rolled up data in intermediate folder. 
# In future if you update rollup mapping to be more comprehensive or if you want to look at codes that are unmapped, you can always come back.

# Left join keeps every diagnosis row; unmapped codes get PheCode = NaN.
sample_diagnoses_phecode = pd.merge(sample_diagnoses, icd_to_phecode, how='left', on=['code','coding_system'])

# Flag each row: '1' if the ICD code was mapped to a PheCode, '0' otherwise.
# Stored as strings, not ints -- later cells compare against "1"/"0".
sample_diagnoses_phecode['Rollup_Status'] = sample_diagnoses_phecode['PheCode'].notna().replace({True: '1', False: '0'})

display(sample_diagnoses_phecode.head())
print(sample_diagnoses_phecode.shape)
subject_id date code coding_system PheCode Rollup_Status
0 10000032 2180-05-06 5723 ICD9 571.81 1
1 10000032 2180-05-06 78959 ICD9 572 1
2 10000032 2180-05-06 5715 ICD9 571.51 1
3 10000032 2180-05-06 07070 ICD9 070.3 1
4 10000032 2180-05-06 496 ICD9 496 1
(799778, 6)
# Unrolled rows (where ICD codes were not rolled up)

# Keep the rows whose code had no PheCode mapping (Rollup_Status == "0")
sample_diagnoses_unrolled = sample_diagnoses_phecode[sample_diagnoses_phecode["Rollup_Status"]=="0"]
display(sample_diagnoses_unrolled.head())
print(sample_diagnoses_unrolled.shape)
subject_id date code coding_system PheCode Rollup_Status
34 10000032 2180-07-23 V4986 ICD9 NaN 0
44 10001319 2135-07-20 V270 ICD9 NaN 0
46 10001319 2138-11-09 V270 ICD9 NaN 0
48 10001319 2134-04-15 V270 ICD9 NaN 0
52 10001843 2131-11-09 Y840 ICD10 NaN 0
(52416, 6)
# Summarize the codes that have not been rolledup

# Count each unmapped (code, coding_system) pair once per subject, so the
# frequency reflects how many distinct patients carry the unmapped code.
unique_subject_icd_pairs = sample_diagnoses_unrolled[['subject_id', 'code','coding_system']].drop_duplicates()

icdcode_frequencies = unique_subject_icd_pairs[['code','coding_system']].value_counts().reset_index(name='counts')

# NOTE(review): reset_index(name='counts') already restores the original
# column names, so renaming 'index' below appears to be a no-op -- verify.
sorted_icdcode_frequencies = icdcode_frequencies.rename(columns={'index': 'code'}).sort_values(by='counts', ascending=False)

display(sorted_icdcode_frequencies.head(10))
code coding_system counts
0 Z20822 ICD10 3009
1 Y929 ICD10 1956
2 Y92230 ICD10 1115
3 V270 ICD9 1086
4 V4986 ICD9 1057
5 Y92009 ICD10 901
6 Y92239 ICD10 898
7 E8497 ICD9 867
8 E8490 ICD9 743
9 E8788 ICD9 694

Once the data looks reasonable, with a good enough rollup done, you can save the data. You can save the comprehensive data with rolled and unrolled info into the intermediate_data folder. You can come back to this if you need to check anything in the future.

You can save the rolled-up file under rolledup_data.

We don’t really need all the columns after rollup is performed. Below we just keep the data we need.

# Keep only the successfully rolled-up rows (Rollup_Status is the string "1")
sample_diagnoses_phecode_filtered = sample_diagnoses_phecode[sample_diagnoses_phecode['Rollup_Status']=="1"]

print(sample_diagnoses_phecode_filtered )
# After rollup only subject, PheCode and date are needed downstream
sample_diagnoses_phecode_filtered  = sample_diagnoses_phecode_filtered [['subject_id','PheCode','date']]
sample_diagnoses_phecode_filtered 
       subject_id        date    code coding_system PheCode Rollup_Status
0        10000032  2180-05-06    5723          ICD9  571.81             1
1        10000032  2180-05-06   78959          ICD9     572             1
2        10000032  2180-05-06    5715          ICD9  571.51             1
3        10000032  2180-05-06   07070          ICD9   070.3             1
4        10000032  2180-05-06     496          ICD9     496             1
...           ...         ...     ...           ...     ...           ...
799773   19999784  2121-01-31   Z5111         ICD10    1010             1
799774   19999784  2121-01-31   C8589         ICD10   202.2             1
799775   19999784  2121-01-31    E876         ICD10  276.14             1
799776   19999784  2121-01-31  Z87891         ICD10     318             1
799777   19999784  2121-01-31   Z8619         ICD10     136             1

[747362 rows x 6 columns]
subject_id PheCode date
0 10000032 571.81 2180-05-06
1 10000032 572 2180-05-06
2 10000032 571.51 2180-05-06
3 10000032 070.3 2180-05-06
4 10000032 496 2180-05-06
... ... ... ...
799773 19999784 1010 2121-01-31
799774 19999784 202.2 2121-01-31
799775 19999784 276.14 2121-01-31
799776 19999784 318 2121-01-31
799777 19999784 136 2121-01-31

747362 rows × 3 columns

# Duplicates can appear because several ICD codes on the same date may roll
# up to the same PheCode; keep one row per (subject_id, PheCode, date).
if sample_diagnoses_phecode_filtered.duplicated().sum() > 0:
    print("Duplicate rows found. Removing duplicates...")
    sample_diagnoses_phecode_filtered = sample_diagnoses_phecode_filtered.drop_duplicates()  # Remove duplicate rows
    print("DataFrame after removing duplicates:")
else:
    print("No duplicate rows found.")

display(sample_diagnoses_phecode_filtered)
Duplicate rows found. Removing duplicates...
DataFrame after removing duplicates:
subject_id PheCode date
0 10000032 571.81 2180-05-06
1 10000032 572 2180-05-06
2 10000032 571.51 2180-05-06
3 10000032 070.3 2180-05-06
4 10000032 496 2180-05-06
... ... ... ...
799773 19999784 1010 2121-01-31
799774 19999784 202.2 2121-01-31
799775 19999784 276.14 2121-01-31
799776 19999784 318 2121-01-31
799777 19999784 136 2121-01-31

714287 rows × 3 columns

For the full notebook with code, please visit here.