Module src.data
Expand source code
# directories
import numpy as np
import pandas as pd
import data_pecarn
import data_psrc
# cv_fold values for PECARN rows (add_cv_split gives PECARN rows folds 1-6);
# presumably folds 1-4 are used for training and 5-6 for testing
pecarn_train_idxs = [1, 2, 3, 4]
pecarn_test_idxs = [5, 6]
# cv_fold values for PSRC rows (add_cv_split offsets PSRC folds by 7, giving 8-13)
psrc_train_idxs = [8, 9, 10, 11]
psrc_test_idxs = [12, 13]
# common feats
feats_numerical = ['InitSysBPRange', 'InitHeartRate', 'GCSScore', 'Age']
feats_categorical = ['AbdTenderDegree', 'Race', 'MOI']
# columns carried alongside the features; select_final_feats drops these names
meta = ['iai_intervention', 'cv_fold', 'dset']
# name of the outcome column; get_sample_weights reads it to pick class weights
outcome_def = 'iai_intervention' # output
def load_it_all(dummy=True, impute=True, frac_missing_allowed=0.1):
    '''Load the PECARN data and, when it is available, the PSRC data

    Params
    ------
    dummy
        forwarded as ``dummy`` to both ``get_data`` loaders
    impute
        forwarded as ``impute_feats`` to both ``get_data`` loaders
    frac_missing_allowed
        forwarded to the PECARN loader only

    Returns
    -------
    df_pecarn
        result of ``data_pecarn.get_data``
    df_psrc
        result of ``data_psrc.get_data``, or a 0-row slice of ``df_pecarn``
        when the PSRC data could not be loaded
    common_feats: List[Str]
        ``meta`` followed by the filtered feature names shared by both
        datasets; a hard-coded list followed by ``meta`` in the fallback case
    filtered_feats_pecarn: Set[Str]
        second return value of ``get_feat_names(df_pecarn)``
    filtered_feats_psrc: Set[Str] or None
        None when the PSRC data could not be loaded
    '''
    df_pecarn = data_pecarn.get_data(use_processed=False,
                                     frac_missing_allowed=frac_missing_allowed,
                                     dummy=dummy,
                                     impute_feats=impute)
    _, filtered_feats_pecarn = get_feat_names(df_pecarn)

    try:
        df_psrc = data_psrc.get_data(use_processed=False, dummy=dummy, impute_feats=impute)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        print('PSRC data not loaded (not public)')
        df_psrc = df_pecarn[df_pecarn.cv_fold > 100]  # select 0 rows
        filtered_feats_psrc = None
        common_feats = ['AbdDistention_or_AbdomenPain_yes',
                        'AbdTenderDegree_None',
                        'AbdTrauma_or_SeatBeltSign_yes',
                        'Age<2_yes',
                        'CostalTender_yes',
                        'DecrBreathSound_yes',
                        'GCSScore_Full_yes',
                        'Hypotension_yes',
                        'MOI_Bike collision/fall',
                        'MOI_Fall from an elevation',
                        'MOI_Motor vehicle collision',
                        'MOI_Motorcycle/ATV/Scooter collision',
                        'MOI_Object struck abdomen',
                        'MOI_Pedestrian/bicyclist struck by moving vehicle',
                        'ThoracicTrauma_yes',
                        'VomitWretch_yes'] + meta
    else:
        # Only reached when the PSRC data loaded, so a failure here is a real
        # error and is no longer mistaken for "data not available".
        _, filtered_feats_psrc = get_feat_names(df_psrc)
        common_feats = meta + list(filtered_feats_pecarn.intersection(filtered_feats_psrc))

    return df_pecarn, df_psrc, common_feats, filtered_feats_pecarn, filtered_feats_psrc
def to_dummies(df: pd.DataFrame):
    """One-hot encode the frame and drop the columns that carry no signal

    Missing values get their own indicator column (dummy_na=True); any column
    of the encoded frame whose entries are all 0 is then removed.
    """
    encoded = pd.get_dummies(df, dummy_na=True)
    # True for every column holding at least one non-zero entry
    has_nonzero = (encoded != 0).any(axis=0)
    return encoded.loc[:, has_nonzero]
def derived_feats(df):
    '''Add derived features

    Adds the 'yes' / 'no' columns AbdTrauma_or_SeatBeltSign,
    AbdDistention_or_AbdomenPain, Hypotension, GCSScore_Full, Age<2 and
    CostalTender, plus a Race column that folds Hispanic into Race_orig.

    Params
    ------
    df
        frame with the columns AbdTrauma, SeatBeltSign, AbdDistention,
        AbdomenPain, Age, InitSysBPRange, GCSScore, LtCostalTender,
        RtCostalTender, Race_orig and Hispanic; modified in place

    Returns
    -------
    df
        the same frame with the derived columns added
    '''
    # 0 / False and 1 / True hash and compare equal, so this dict effectively
    # holds the keys 0, 1 and 'unknown'; boolean masks map through 0 / 1.
    binary = {
        0: 'no',
        1: 'yes',
        False: 'no',
        True: 'yes',
        'unknown': 'unknown'
    }

    df['AbdTrauma_or_SeatBeltSign'] = ((df.AbdTrauma == 'yes') | (df.SeatBeltSign == 'yes')).map(binary)
    # 'yes' when either abdominal distention or abdominal pain is recorded
    df['AbdDistention_or_AbdomenPain'] = ((df.AbdDistention == 'yes') | (df.AbdomenPain == 'yes')).map(binary)

    # Systolic BP cutoff depends on age: < 70 below 1/12, < 80 from 1/12 up
    # to 5, < 90 from 5 on (Age is presumably in years - TODO confirm).
    # Parentheses make the & / | grouping explicit.
    age = df['Age']
    sys_bp = df['InitSysBPRange']
    hypotensive = (
        ((age < 1 / 12) & (sys_bp < 70))
        | ((age >= 1 / 12) & (age < 5) & (sys_bp < 80))
        | ((age >= 5) & (sys_bp < 90))
    )
    df['Hypotension'] = hypotensive.map(binary)

    df['GCSScore_Full'] = (df['GCSScore'] == 15).map(binary)
    df['Age<2'] = (df['Age'] < 2).map(binary)
    df['CostalTender'] = ((df.LtCostalTender == 1) | (df.RtCostalTender == 1)).map(binary)  # | (df.DecrBreathSound)

    # Combine hispanic as part of race
    df['Race'] = df['Race_orig']
    df.loc[df.Hispanic == 'yes', 'Race'] = 'Hispanic'
    df.loc[df.Race == 'White', 'Race'] = 'White (Non-Hispanic)'
    return df
def remove_from_list(l, removes):
    '''Return a new list with the items of l that are not in removes
    (kept items stay in their original order; l itself is not modified)
    '''
    kept = []
    for item in l:
        if item in removes:
            continue
        kept.append(item)
    return kept
def select_final_feats(feat_names,
                       collapse_abd_tender=True,
                       collapse_abd_distention=True,
                       collapse_age=True):
    '''Return an interpretable set of the best features

    Params
    ------
    feat_names
        iterable of candidate feature names (strings)
    collapse_abd_tender
        if True, drop the Mild / Moderate / Severe AbdTenderDegree names
    collapse_abd_distention
        if True, drop 'AbdomenPain_yes' and 'AbdDistention_yes';
        otherwise drop 'AbdDistention_or_AbdomenPain_yes'
    collapse_age
        if True, drop 'Age'; otherwise drop 'Age<2_yes'

    Returns
    -------
    sorted list of the remaining feature names
    '''
    def is_excluded(name):
        # meta columns, '_no' names, Race names, and 'other' / 'unknown' names
        return (name in meta
                or name.endswith('_no')
                or 'Race' in name
                or 'other' in name.lower()
                or 'unknown' in name.lower())

    # names that are always dropped
    dropped = [
        'LtCostalTender', 'RtCostalTender',
        'AbdTrauma_yes', 'SeatBeltSign_yes',
        'GCSScore',
        'InitHeartRate', 'InitSysBPRange',  # remove these so we can only have binary vars
    ]

    # make abd tender into a None or not-None variable
    if collapse_abd_tender:
        dropped += ['AbdTenderDegree_Mild', 'AbdTenderDegree_Moderate', 'AbdTenderDegree_Severe']

    # whether to combine AbdomenPain and AbdDistention
    if collapse_abd_distention:
        dropped += ['AbdomenPain_yes', 'AbdDistention_yes']
    else:
        dropped.append('AbdDistention_or_AbdomenPain_yes')

    dropped.append('Age' if collapse_age else 'Age<2_yes')

    candidates = [name for name in feat_names if not is_excluded(name)]
    return sorted(remove_from_list(candidates, dropped))
# Reduced, hand-written list of binary feature names. Every active entry also
# appears in the fallback common_feats list inside load_it_all; the
# commented-out entries are not part of the list.
fewest_feats = [
# 'AbdDistention_yes',
'AbdTenderDegree_None',
'AbdTrauma_or_SeatBeltSign_yes',
# 'AbdomenPain_yes',
# 'Age',
'CostalTender_yes',
'DecrBreathSound_yes',
'GCSScore_Full_yes',
'MOI_Fall from an elevation',
'MOI_Motor vehicle collision',
'MOI_Motorcycle/ATV/Scooter collision',
# 'MOI_Pedestrian/bicyclist struck by moving vehicle',
'ThoracicTrauma_yes',
'VomitWretch_yes']
def add_cv_split(df: pd.DataFrame, dset='pecarn'):
    '''Assign every row a random cross-validation fold in a cv_fold column

    Params
    ------
    df
        frame to label; modified in place
    dset
        'pecarn' (folds 1-6) or 'psrc' (folds 8-13)

    Returns
    -------
    df
        the same frame with the cv_fold column set

    Raises
    ------
    ValueError
        if dset is neither 'pecarn' nor 'psrc'
    '''
    fold_offsets = {'pecarn': 0, 'psrc': 7}
    if dset not in fold_offsets:
        raise ValueError(
            "dset must be one of %s, got %r" % (sorted(fold_offsets), dset))

    # set up train / test
    # NOTE: this reseeds numpy's global RNG, so the fold assignment is the
    # same on every call for a given number of rows.
    np.random.seed(1)
    df['cv_fold'] = np.random.randint(1, 7, size=df.shape[0]) + fold_offsets[dset]
    return df
def get_feat_names(df):
    '''Get feature names for pecarn

    Original PECARN feats
    ---------------------
    Originally used features: age < 2, severe mechanism of injury (includes many things),
    vomiting, hypotension, GCS,
    thoracic tenderness, evidence of thoracic wall trauma,
    costal margin tenderness, decreased breath sounds, abdominal distention,
    complaints of abdominal pain, abdominal tenderness (3 levels),
    evidence of abdominal wall trauma or seat belt sign,
    distracting painful injury,
    femur fracture

    Params
    ------
    df
        object whose ``keys()`` yields string column names (e.g. a DataFrame)

    Returns
    -------
    feat_names: List[Str]
        All keys of df except 'id', 'cv_fold' and any key containing 'iai'
        (case-insensitive), in their original order
    pecarn_feats: Set[Str]
        The entries of feat_names that contain one of the original PECARN
        feature names as a substring
    '''
    feat_names = [k for k in df.keys()  # features to use
                  if k not in ('id', 'cv_fold')
                  and 'iai' not in k.lower()]
    PECARN_FEAT_NAMES = ['AbdDistention',
                         'AbdTenderDegree',
                         'AbdTrauma',
                         'AbdTrauma_or_SeatBeltSign',
                         'AbdomenPain',
                         'Costal',
                         'DecrBreathSound',
                         'DistractingPain',
                         'FemurFracture',
                         'GCSScore',
                         'Hypotension',
                         'LtCostalTender',
                         'MOI',
                         'RtCostalTender',
                         'SeatBeltSign',
                         'ThoracicTender',
                         'ThoracicTrauma',
                         'VomitWretch',
                         'Age',
                         'Sex'] + \
                        ['Race', 'InitHeartRate', 'InitSysBPRange']  # new ones to consider

    # Substring match: a name such as 'MOI_Fall from an elevation' is kept
    # because it contains 'MOI'.
    pecarn_feats = {name for name in feat_names
                    if any(base in name for base in PECARN_FEAT_NAMES)}
    return feat_names, pecarn_feats
def get_sample_weights(df, df_pecarn, df_psrc, balancing_ratio):
    '''Get sample weights which also account for age / gender

    Each weight is the class weight (1 for outcome 0, balancing_ratio for
    outcome 1) times a weight looked up from the row's (sex, age bucket)
    pair. Sex is taken positionally from df_pecarn followed by df_psrc, so
    df is presumably the PECARN rows followed by the PSRC rows - confirm
    against the callers.
    '''
    # weights for different risk populations
    weight_by_group = {
        ('F', '<5'): 33.9, ('F', '5-9'): 25.8, ('F', '>9'): 27.2,
        ('M', '<5'): 14.8, ('M', '5-9'): 13.7, ('M', '>9'): 13.1
    }

    # age buckets with bin edges -1, 4, 9, 1000
    age_bucket = pd.cut(df['Age'], bins=(-1, 4, 9, 1000), labels=['<5', '5-9', '>9']).values

    # we don't have sex for psrc, so just fill in 0 (only matters for training anyway)
    sex_codes = np.hstack((df_pecarn['Sex_M'].values, np.zeros(df_psrc.shape[0])))
    sex_label = pd.Series(sex_codes).map({0: 'F', 1: 'M'}).values

    n_rows = age_bucket.shape[0]
    groups = [(sex_label[row], age_bucket[row]) for row in range(n_rows)]
    group_weight = pd.Series(groups).map(weight_by_group).values

    # class weights
    class_weight = pd.Series(df[outcome_def]).map({0: 1, 1: balancing_ratio}).values

    return class_weight * group_weight  # elementwise multiply
Functions
def add_cv_split(df, dset='pecarn')
-
Expand source code
def add_cv_split(df: pd.DataFrame, dset='pecarn'):
    '''Assign every row a random cross-validation fold in a cv_fold column

    Params
    ------
    df
        frame to label; modified in place
    dset
        'pecarn' (folds 1-6) or 'psrc' (folds 8-13)

    Returns
    -------
    df
        the same frame with the cv_fold column set

    Raises
    ------
    ValueError
        if dset is neither 'pecarn' nor 'psrc'
    '''
    fold_offsets = {'pecarn': 0, 'psrc': 7}
    if dset not in fold_offsets:
        raise ValueError(
            "dset must be one of %s, got %r" % (sorted(fold_offsets), dset))

    # set up train / test
    # NOTE: this reseeds numpy's global RNG, so the fold assignment is the
    # same on every call for a given number of rows.
    np.random.seed(1)
    df['cv_fold'] = np.random.randint(1, 7, size=df.shape[0]) + fold_offsets[dset]
    return df
def derived_feats(df)
-
Add derived features
Expand source code
def derived_feats(df):
    '''Add derived features

    Adds the 'yes' / 'no' columns AbdTrauma_or_SeatBeltSign,
    AbdDistention_or_AbdomenPain, Hypotension, GCSScore_Full, Age<2 and
    CostalTender, plus a Race column that folds Hispanic into Race_orig.

    Params
    ------
    df
        frame with the columns AbdTrauma, SeatBeltSign, AbdDistention,
        AbdomenPain, Age, InitSysBPRange, GCSScore, LtCostalTender,
        RtCostalTender, Race_orig and Hispanic; modified in place

    Returns
    -------
    df
        the same frame with the derived columns added
    '''
    # 0 / False and 1 / True hash and compare equal, so this dict effectively
    # holds the keys 0, 1 and 'unknown'; boolean masks map through 0 / 1.
    binary = {
        0: 'no',
        1: 'yes',
        False: 'no',
        True: 'yes',
        'unknown': 'unknown'
    }

    df['AbdTrauma_or_SeatBeltSign'] = ((df.AbdTrauma == 'yes') | (df.SeatBeltSign == 'yes')).map(binary)
    # 'yes' when either abdominal distention or abdominal pain is recorded
    df['AbdDistention_or_AbdomenPain'] = ((df.AbdDistention == 'yes') | (df.AbdomenPain == 'yes')).map(binary)

    # Systolic BP cutoff depends on age: < 70 below 1/12, < 80 from 1/12 up
    # to 5, < 90 from 5 on (Age is presumably in years - TODO confirm).
    # Parentheses make the & / | grouping explicit.
    age = df['Age']
    sys_bp = df['InitSysBPRange']
    hypotensive = (
        ((age < 1 / 12) & (sys_bp < 70))
        | ((age >= 1 / 12) & (age < 5) & (sys_bp < 80))
        | ((age >= 5) & (sys_bp < 90))
    )
    df['Hypotension'] = hypotensive.map(binary)

    df['GCSScore_Full'] = (df['GCSScore'] == 15).map(binary)
    df['Age<2'] = (df['Age'] < 2).map(binary)
    df['CostalTender'] = ((df.LtCostalTender == 1) | (df.RtCostalTender == 1)).map(binary)  # | (df.DecrBreathSound)

    # Combine hispanic as part of race
    df['Race'] = df['Race_orig']
    df.loc[df.Hispanic == 'yes', 'Race'] = 'Hispanic'
    df.loc[df.Race == 'White', 'Race'] = 'White (Non-Hispanic)'
    return df
def get_feat_names(df)
-
Get feature names for pecarn
Original PECARN feats
- Originally used features: age < 2, severe mechanism of injury (includes many things), vomiting, hypotension, GCS, thoracic tenderness, evidence of thoracic wall trauma, costal margin tenderness, decreased breath sounds, abdominal distention, complaints of abdominal pain, abdominal tenderness (3 levels), evidence of abdominal wall trauma or seat belt sign, distracting painful injury, femur fracture
Returns
feat_names : List[Str] - All valid feature names
pecarn_feats : Set[Str] - All valid feature names corresponding to original pecarn iai study
Expand source code
def get_feat_names(df):
    '''Get feature names for pecarn

    Original PECARN feats
    ---------------------
    Originally used features: age < 2, severe mechanism of injury (includes many things),
    vomiting, hypotension, GCS,
    thoracic tenderness, evidence of thoracic wall trauma,
    costal margin tenderness, decreased breath sounds, abdominal distention,
    complaints of abdominal pain, abdominal tenderness (3 levels),
    evidence of abdominal wall trauma or seat belt sign,
    distracting painful injury,
    femur fracture

    Params
    ------
    df
        object whose ``keys()`` yields string column names (e.g. a DataFrame)

    Returns
    -------
    feat_names: List[Str]
        All keys of df except 'id', 'cv_fold' and any key containing 'iai'
        (case-insensitive), in their original order
    pecarn_feats: Set[Str]
        The entries of feat_names that contain one of the original PECARN
        feature names as a substring
    '''
    feat_names = [k for k in df.keys()  # features to use
                  if k not in ('id', 'cv_fold')
                  and 'iai' not in k.lower()]
    PECARN_FEAT_NAMES = ['AbdDistention',
                         'AbdTenderDegree',
                         'AbdTrauma',
                         'AbdTrauma_or_SeatBeltSign',
                         'AbdomenPain',
                         'Costal',
                         'DecrBreathSound',
                         'DistractingPain',
                         'FemurFracture',
                         'GCSScore',
                         'Hypotension',
                         'LtCostalTender',
                         'MOI',
                         'RtCostalTender',
                         'SeatBeltSign',
                         'ThoracicTender',
                         'ThoracicTrauma',
                         'VomitWretch',
                         'Age',
                         'Sex'] + \
                        ['Race', 'InitHeartRate', 'InitSysBPRange']  # new ones to consider

    # Substring match: a name such as 'MOI_Fall from an elevation' is kept
    # because it contains 'MOI'.
    pecarn_feats = {name for name in feat_names
                    if any(base in name for base in PECARN_FEAT_NAMES)}
    return feat_names, pecarn_feats
def get_sample_weights(df, df_pecarn, df_psrc, balancing_ratio)
-
Get sample weights which also account for age / gender
Expand source code
def get_sample_weights(df, df_pecarn, df_psrc, balancing_ratio):
    '''Get sample weights which also account for age / gender

    Each weight is the class weight (1 for outcome 0, balancing_ratio for
    outcome 1) times a weight looked up from the row's (sex, age bucket)
    pair. Sex is taken positionally from df_pecarn followed by df_psrc, so
    df is presumably the PECARN rows followed by the PSRC rows - confirm
    against the callers.
    '''
    # weights for different risk populations
    weight_by_group = {
        ('F', '<5'): 33.9, ('F', '5-9'): 25.8, ('F', '>9'): 27.2,
        ('M', '<5'): 14.8, ('M', '5-9'): 13.7, ('M', '>9'): 13.1
    }

    # age buckets with bin edges -1, 4, 9, 1000
    age_bucket = pd.cut(df['Age'], bins=(-1, 4, 9, 1000), labels=['<5', '5-9', '>9']).values

    # we don't have sex for psrc, so just fill in 0 (only matters for training anyway)
    sex_codes = np.hstack((df_pecarn['Sex_M'].values, np.zeros(df_psrc.shape[0])))
    sex_label = pd.Series(sex_codes).map({0: 'F', 1: 'M'}).values

    n_rows = age_bucket.shape[0]
    groups = [(sex_label[row], age_bucket[row]) for row in range(n_rows)]
    group_weight = pd.Series(groups).map(weight_by_group).values

    # class weights
    class_weight = pd.Series(df[outcome_def]).map({0: 1, 1: balancing_ratio}).values

    return class_weight * group_weight  # elementwise multiply
def load_it_all(dummy=True, impute=True, frac_missing_allowed=0.1)
-
Expand source code
def load_it_all(dummy=True, impute=True, frac_missing_allowed=0.1):
    '''Load the PECARN data and, when it is available, the PSRC data

    Params
    ------
    dummy
        forwarded as ``dummy`` to both ``get_data`` loaders
    impute
        forwarded as ``impute_feats`` to both ``get_data`` loaders
    frac_missing_allowed
        forwarded to the PECARN loader only

    Returns
    -------
    df_pecarn
        result of ``data_pecarn.get_data``
    df_psrc
        result of ``data_psrc.get_data``, or a 0-row slice of ``df_pecarn``
        when the PSRC data could not be loaded
    common_feats: List[Str]
        ``meta`` followed by the filtered feature names shared by both
        datasets; a hard-coded list followed by ``meta`` in the fallback case
    filtered_feats_pecarn: Set[Str]
        second return value of ``get_feat_names(df_pecarn)``
    filtered_feats_psrc: Set[Str] or None
        None when the PSRC data could not be loaded
    '''
    df_pecarn = data_pecarn.get_data(use_processed=False,
                                     frac_missing_allowed=frac_missing_allowed,
                                     dummy=dummy,
                                     impute_feats=impute)
    _, filtered_feats_pecarn = get_feat_names(df_pecarn)

    try:
        df_psrc = data_psrc.get_data(use_processed=False, dummy=dummy, impute_feats=impute)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        print('PSRC data not loaded (not public)')
        df_psrc = df_pecarn[df_pecarn.cv_fold > 100]  # select 0 rows
        filtered_feats_psrc = None
        common_feats = ['AbdDistention_or_AbdomenPain_yes',
                        'AbdTenderDegree_None',
                        'AbdTrauma_or_SeatBeltSign_yes',
                        'Age<2_yes',
                        'CostalTender_yes',
                        'DecrBreathSound_yes',
                        'GCSScore_Full_yes',
                        'Hypotension_yes',
                        'MOI_Bike collision/fall',
                        'MOI_Fall from an elevation',
                        'MOI_Motor vehicle collision',
                        'MOI_Motorcycle/ATV/Scooter collision',
                        'MOI_Object struck abdomen',
                        'MOI_Pedestrian/bicyclist struck by moving vehicle',
                        'ThoracicTrauma_yes',
                        'VomitWretch_yes'] + meta
    else:
        # Only reached when the PSRC data loaded, so a failure here is a real
        # error and is no longer mistaken for "data not available".
        _, filtered_feats_psrc = get_feat_names(df_psrc)
        common_feats = meta + list(filtered_feats_pecarn.intersection(filtered_feats_psrc))

    return df_pecarn, df_psrc, common_feats, filtered_feats_pecarn, filtered_feats_psrc
def remove_from_list(l, removes)
-
Deletes all elements in removes from the list l and returns the resulting list
Expand source code
def remove_from_list(l, removes):
    '''Return a new list with the items of l that are not in removes
    (kept items stay in their original order; l itself is not modified)
    '''
    kept = []
    for item in l:
        if item in removes:
            continue
        kept.append(item)
    return kept
def select_final_feats(feat_names, collapse_abd_tender=True, collapse_abd_distention=True, collapse_age=True)
-
Return an interpretable set of the best features
Expand source code
def select_final_feats(feat_names,
                       collapse_abd_tender=True,
                       collapse_abd_distention=True,
                       collapse_age=True):
    '''Return an interpretable set of the best features

    Params
    ------
    feat_names
        iterable of candidate feature names (strings)
    collapse_abd_tender
        if True, drop the Mild / Moderate / Severe AbdTenderDegree names
    collapse_abd_distention
        if True, drop 'AbdomenPain_yes' and 'AbdDistention_yes';
        otherwise drop 'AbdDistention_or_AbdomenPain_yes'
    collapse_age
        if True, drop 'Age'; otherwise drop 'Age<2_yes'

    Returns
    -------
    sorted list of the remaining feature names
    '''
    def is_excluded(name):
        # meta columns, '_no' names, Race names, and 'other' / 'unknown' names
        return (name in meta
                or name.endswith('_no')
                or 'Race' in name
                or 'other' in name.lower()
                or 'unknown' in name.lower())

    # names that are always dropped
    dropped = [
        'LtCostalTender', 'RtCostalTender',
        'AbdTrauma_yes', 'SeatBeltSign_yes',
        'GCSScore',
        'InitHeartRate', 'InitSysBPRange',  # remove these so we can only have binary vars
    ]

    # make abd tender into a None or not-None variable
    if collapse_abd_tender:
        dropped += ['AbdTenderDegree_Mild', 'AbdTenderDegree_Moderate', 'AbdTenderDegree_Severe']

    # whether to combine AbdomenPain and AbdDistention
    if collapse_abd_distention:
        dropped += ['AbdomenPain_yes', 'AbdDistention_yes']
    else:
        dropped.append('AbdDistention_or_AbdomenPain_yes')

    dropped.append('Age' if collapse_age else 'Age<2_yes')

    candidates = [name for name in feat_names if not is_excluded(name)]
    return sorted(remove_from_list(candidates, dropped))
def to_dummies(df)
-
Prepare the data for classification
Expand source code
def to_dummies(df: pd.DataFrame):
    """One-hot encode the frame and drop the columns that carry no signal

    Missing values get their own indicator column (dummy_na=True); any column
    of the encoded frame whose entries are all 0 is then removed.
    """
    encoded = pd.get_dummies(df, dummy_na=True)
    # True for every column holding at least one non-zero entry
    has_nonzero = (encoded != 0).any(axis=0)
    return encoded.loc[:, has_nonzero]