Module src.data_pecarn
import os
from os.path import join as oj
import numpy as np
import pandas as pd
from tqdm import tqdm
import data
from config import PROCESSED_DIR, PECARN_DIR
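# total number of patients in the PECARN dataset (verified in unit_test below)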
NUM_PATIENTS = 12044
def get_data(use_processed=False, frac_missing_allowed=0.05,
processed_file=oj(PROCESSED_DIR, 'df_pecarn.pkl'), dummy=False,
impute_feats=True):
    '''Run all the preprocessing.

    Params
    ------
    use_processed: bool, optional
        whether to load the df from the cached pkl instead of re-reading the csvs
    frac_missing_allowed: float, optional
        drop columns whose values are missing more than this fraction of the time
    processed_file: str, optional
        path where the processed df is cached
    dummy: bool, optional
        whether to one-hot encode categorical features
    impute_feats: bool, optional
        whether to impute missing feature values
    '''
if use_processed and os.path.exists(processed_file):
return pd.read_pickle(processed_file)
else:
df_features = get_features() # read all features into df
df_outcomes = get_outcomes() # 2 outcomes: iai, and iai_intervention
df = pd.merge(df_features, df_outcomes, on='id', how='left')
df = rename_values(df) # rename the features by their meaning
df = data.derived_feats(df)
        # drop cols whose values are missing more than frac_missing_allowed of the time
        df = df.dropna(axis=1, thresh=int(np.ceil((1 - frac_missing_allowed) * NUM_PATIENTS)))
# delete repeat columns
'''
keys = list(df.keys())
keys_to_remove = [k for k in keys if 'Repeat_instance' in k]
df = df.drop(labels=keys_to_remove, axis=1)
'''
if impute_feats:
df = impute(df) # impute and fill
df = data.add_cv_split(df, dset='pecarn')
if dummy:
df = data.to_dummies(df)
df['dset'] = 'pecarn'
# save
os.makedirs(os.path.dirname(processed_file), exist_ok=True)
df.to_pickle(processed_file)
df.to_csv(processed_file[:-4] + '.csv')
unit_test(df)
return df
def get_features(processed_file=oj(PROCESSED_DIR, 'df_pecarn_features.pkl')):
'''Read all features into df
Returns
-------
features: pd.DataFrame
'''
if os.path.exists(processed_file):
return pd.read_pickle(processed_file)
# all the fnames to be loaded and searched over
    fnames = sorted([fname for fname in os.listdir(PECARN_DIR)
                     if 'csv' in fname
                     and 'formats' not in fname
                     and 'form6' not in fname])  # skip form6, which contains the outcomes
# feature_names = [fname[:-4].replace('form', '').replace('-', '_') for fname in fnames]
# demographics = pd.read_csv('iaip_data/Datasets/demographics.csv')
# read through each fname and save into the r dictionary
r = {}
print('read all the csvs...')
for fname in tqdm(fnames):
df = pd.read_csv(oj(PECARN_DIR, fname), engine='python')
        df.rename(columns={'SubjectID': 'id', 'subjectid': 'id'}, inplace=True)
        assert 'id' in df.columns, f'{fname} should contain an id column'
r[fname] = df
# loop over the relevant forms and merge into one big df
fnames_small = [fname for fname in fnames
if 'form1' in fname
or 'form2' in fname
or 'form4' in fname
or 'form5' in fname
or 'form7' in fname
]
    df = r[fnames[0]]  # start from the first csv, then merge the rest in
print('merge all the dfs...')
    for fname in tqdm(fnames_small):
df2 = r[fname].copy()
        # if a subject has multiple entries, keep only the last one
        df2 = df2.drop_duplicates(subset=['id'], keep='last')
'''
# possibly rename the columns to include form number
rename_dict = {
key: key + '_' + fname[:-4].replace('form', '')
for key in df2.keys()
if not key == 'id'
}
df2.rename(columns=rename_dict, inplace=True)
'''
        # merge on id; combine_first avoids duplicating shared columns
        # (values already present in df take precedence over df2)
        df = df.set_index('id').combine_first(df2.set_index('id')).reset_index()
# save to pickle
os.makedirs(os.path.dirname(processed_file), exist_ok=True)
df.to_pickle(processed_file)
return df
def get_outcomes():
"""Read in the outcomes
Returns
-------
outcomes: pd.DataFrame
iai (has 761 positives)
iai_intervention (has 203 positives)
"""
form4abdangio = pd.read_csv(oj(PECARN_DIR, 'form4bother_abdangio.csv')).rename(columns={'subjectid': 'id'})
# form6a = pd.read_csv(oj(PECARN_DIR, 'form6a.csv')).rename(columns={'subjectid': 'id'})
form6b = pd.read_csv(oj(PECARN_DIR, 'form6b.csv')).rename(columns={'SubjectID': 'id'})
form6c = pd.read_csv(oj(PECARN_DIR, 'form6c.csv')).rename(columns={'subjectid': 'id'})
# (6b) Intra-abdominal injury diagnosed in the ED/during hospitalization by any diagnostic method
# 1 is yes, 761 have intra-abdominal injury
# 2 is no -> remap to 0, 841 without intra-abdominal injury
def get_ids(form, keys):
'''Returns ids for which any of the keys is 1
'''
        ids_all = set()
        for key in keys:
            ids_all.update(form.id.values[form[key] == 1])
        return ids_all
ids_iai = get_ids(form6b, ['IAIinED1']) # form6b.id[form6b['IAIinED1'] == 1]
# print(form4abdangio.keys())
ids_allangio = get_ids(form4abdangio, ['AbdAngioVessel'])
# print('num in 4angio', len(ids_allangio))
# print(form6a.keys())
# ids_alla = get_ids(form6a, ['DeathCause'])
# print('num in a', len(ids_alla))
# print(form6b.keys())
ids_allb = get_ids(form6b, ['IVFluids', 'BldTransfusion'])
# print('num in b', len(ids_allb))
# print(form6c.keys())
ids_allc = get_ids(form6c, ['IntervenDurLap'])
# print('num in c', len(ids_allc))
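    # iai_intervention combines abdominal angiography (form4), IV fluids or
    # blood transfusion (form6b), and intervention during laparotomy (form6c)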
ids = ids_allb.union(ids_allangio).union(ids_allc)
    # subject ids are 1-indexed, so shift by 1 to get 0-indexed array positions
    ids_iai_np = np.array(list(ids_iai)) - 1
    ids_np = np.array(list(ids)) - 1
    iai = np.zeros(NUM_PATIENTS, dtype=int)
    iai[ids_iai_np] = 1
    iai_intervention = np.zeros(NUM_PATIENTS, dtype=int)
    iai_intervention[ids_np] = 1
df_iai = pd.DataFrame.from_dict({
'id': np.arange(1, NUM_PATIENTS + 1),
'iai': iai,
'iai_intervention': iai_intervention
})
return df_iai
def rename_values(df):
    '''Map values to their meanings,
    rename some features,
    compute a couple of new features,
    and set the types of categorical variables.
    '''
# map categorical vars values
race = {
1: 'American Indian or Alaska Native',
2: 'Asian',
3: 'Black or African American',
4: 'Native Hawaiian or other Pacific Islander',
5: 'White',
6: 'unknown', # stated as unknown
7: 'unknown' # other
}
df.RACE = df.RACE.map(race)
moi = {
1: 'Motor vehicle collision',
2: 'Fall from an elevation',
3: 'Fall down stairs',
4: 'Pedestrian/bicyclist struck by moving vehicle',
5: 'Bike collision/fall',
6: 'Motorcycle/ATV/Scooter collision',
7: 'Object struck abdomen',
        8: 'unknown',  # unknown mechanism
9: 'unknown', # other mechanism
10: 'unknown' # physician did not answer
}
df['MOI'] = df.RecodedMOI.map(moi)
df = df.drop(columns=['RecodedMOI'])
abdTenderDegree = {
1: 'Mild',
2: 'Moderate',
3: 'Severe',
4: 'unknown',
np.nan: 'unknown'
}
    # where GCSScore is missing but AggregateGCS is present, use the aggregate value
    idxs_to_replace = ~df['AggregateGCS'].isna() & df['GCSScore'].isna()
    df.loc[idxs_to_replace, 'GCSScore'] = df['AggregateGCS'][idxs_to_replace]
# print(np.unique(df['AbdTenderDegree'], return_counts=True))
df['AbdTenderDegree'] = df.AbdTenderDegree.map(abdTenderDegree)
# print(np.unique(df['AbdTenderDegree'], return_counts=True))
binary = {
0: 'no',
1: 'yes',
False: 'no',
True: 'yes',
'unknown': 'unknown'
}
df['HISPANIC_ETHNICITY'] = (df['HISPANIC_ETHNICITY'] == '-1').map(
binary) # note: -1 is Hispanic (0 is not, 1 is unknown)
# rename variables
df = df.rename(columns={'RACE': 'Race_orig',
'SEX': 'Sex',
'HISPANIC_ETHNICITY': 'Hispanic',
'ageinyrs': 'Age'
})
# set types of these variables to categorical
ks_categorical = ['Sex', 'Race_orig', 'Hispanic',
'VomitWretch', 'MOI', 'ThoracicTender', 'ThoracicTrauma',
'DecrBreathSound', 'AbdDistention', 'AbdTenderDegree',
'AbdTrauma', 'SeatBeltSign', 'DistractingPain',
'AbdomenPain', 'AbdomenTender']
for k in ks_categorical:
df[k] = df[k].astype(str)
df['AbdomenPain'] = df['AbdomenPain'].replace('3.0', 'other')
df['CTScan'] = (df['AbdCTScan'] == 1.0).astype(int)
    # remap variables coded 1/2/3/4 (yes/no/unknown/unknown) to strings
ks_remap = ['VomitWretch',
'ThoracicTender', 'ThoracicTrauma',
'DecrBreathSound', 'AbdDistention',
'AbdTrauma', 'SeatBeltSign',
'DistractingPain', 'AbdomenPain', 'AbdomenTender']
    for k in ks_remap:
        vals = df[k].values
        is_na = df[k].isna()
        uniques = np.unique(vals).astype(str)
        contains_nan = np.sum(is_na) > 0
        if (contains_nan and uniques.size in [4, 5]) or (not contains_nan and uniques.size in [3, 4]):
if '1.0' in uniques and '2.0' in uniques and ('3.0' in uniques or 'other' in uniques):
df[k] = df[k].map({
'1.0': 'yes',
'2.0': 'no',
'3.0': 'unknown',
'4.0': 'unknown',
'other': 'other',
np.nan: 'unknown',
})
return df
def impute(df: pd.DataFrame):
    """Returns df with missing values imputed.
    Note: many categorical features have already had missing values filled with "unknown".
    """
# fill in values for some vars from unknown -> None
df.loc[df['AbdomenTender'].isin(['no', 'unknown']), 'AbdTenderDegree'] = 'None'
    # impute remaining missing numeric values with the column median
    df = df.fillna(df.median(numeric_only=True))
    df['GCSScore'] = df['GCSScore'].fillna(df['GCSScore'].median())
return df
def unit_test(df):
    assert df.shape[0] == NUM_PATIENTS, 'should have 12044 patients'
    assert np.sum(df['iai_intervention']) == 203, 'should have 203 patients with iai intervention'
Functions
def get_data(use_processed=False, frac_missing_allowed=0.05, processed_file='../data/processed/df_pecarn.pkl', dummy=False, impute_feats=True)
-
Run all the preprocessing.
Params
use_processed : bool, optional
    whether to load the df from the cached pkl instead of re-reading the csvs
frac_missing_allowed : float, optional
    drop columns whose values are missing more than this fraction of the time
processed_file : str, optional
    path where the processed df is cached
dummy : bool, optional
    whether to one-hot encode categorical features
impute_feats : bool, optional
    whether to impute missing feature values
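A minimal usage sketch (assuming the raw PECARN csvs are available under PECARN_DIR and the module is importable as src.data_pecarn):

# build the processed PECARN df and inspect the outcomes
from src import data_pecarn

df = data_pecarn.get_data(use_processed=False, dummy=False)
print(df.shape[0])                    # 12044 patients
print(df['iai_intervention'].sum())   # 203 positives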
def get_features(processed_file='../data/processed/df_pecarn_features.pkl')
-
Read all features into a single df.
Returns
features : pd.DataFrame
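Internally, the per-form csvs are merged with pandas combine_first, which joins on id and fills gaps without duplicating shared columns. A toy illustration (hypothetical column names):

import pandas as pd

a = pd.DataFrame({'id': [1, 2], 'age': [4.0, None]})
b = pd.DataFrame({'id': [2, 3], 'age': [7.0, 9.0], 'gcs': [15, 14]})
merged = a.set_index('id').combine_first(b.set_index('id')).reset_index()
# values already present in `a` win; rows/columns only in `b` are filled in
print(merged)
#    id  age   gcs
# 0   1  4.0   NaN
# 1   2  7.0  15.0
# 2   3  9.0  14.0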
def get_outcomes()
-
Read in the outcomes.
Returns
outcomes : pd.DataFrame
    iai (761 positives) and iai_intervention (203 positives)
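The outcome columns are built by collecting the 1-indexed subject ids that meet each criterion and scattering them into a zero vector. A condensed sketch of that step (hypothetical ids):

import numpy as np

NUM_PATIENTS = 12044
ids_positive = {3, 17, 12044}                # 1-indexed subject ids
iai = np.zeros(NUM_PATIENTS, dtype=int)
iai[np.array(sorted(ids_positive)) - 1] = 1  # shift to 0-indexed positions
assert iai.sum() == len(ids_positive)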
def impute(df)
-
Returns df with missing values imputed. Note: many categorical features have already had missing values filled with "unknown".
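A toy illustration of the median fill (numeric_only=True keeps pandas from attempting a median over string columns):

import numpy as np
import pandas as pd

df = pd.DataFrame({'GCSScore': [15, np.nan, 3], 'Sex': ['M', 'F', 'F']})
df = df.fillna(df.median(numeric_only=True))
print(df['GCSScore'].tolist())  # [15.0, 9.0, 3.0] -- the median of 15 and 3 is 9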
def rename_values(df)
-
Map values to their meanings, rename some features, compute a couple of new features, and set the types of categorical variables.
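The coded 1.0/2.0/3.0/4.0 answers are remapped to strings via Series.map; a toy example of that step (note that after astype(str), missing values appear as the string 'nan', which is absent from the map and so comes back as NaN):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 4.0, np.nan]).astype(str)  # '1.0', '2.0', '4.0', 'nan'
remapped = s.map({'1.0': 'yes', '2.0': 'no', '3.0': 'unknown', '4.0': 'unknown'})
print(remapped.tolist())  # ['yes', 'no', 'unknown', nan]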
def unit_test(df)
-
Run sanity checks on the processed df: 12044 patients and 203 iai_intervention positives.