Module src.data_psrc
Expand source code
import os
import sys
from os.path import join as oj
from config import PROCESSED_DIR, PSRC_DIR
sys.path.insert(1, oj(sys.path[0], '..')) # insert parent path
import numpy as np
import pandas as pd
import data
def get_data(use_processed=False, processed_file=oj(PROCESSED_DIR, 'df_psrc.pkl'), dummy=False, impute_feats=True):
    '''Run all the preprocessing for the PSRC data

    Params
    ------
    use_processed: bool, optional
        if True and processed_file exists, return the cached pkl instead of
        rebuilding the df from the csv
    processed_file: str, optional
        path of the cached pkl; it is (re)written every time the df is rebuilt
    dummy: bool, optional
        if True, the df is passed through data.to_dummies before being saved
    impute_feats: bool, optional
        if True, impute() is applied to the df

    Returns
    -------
    df: pd.DataFrame
        the cached df (when use_processed applies) or the freshly built one
    '''
    if use_processed and os.path.exists(processed_file):
        return pd.read_pickle(processed_file)

    data_file = oj(PSRC_DIR, 'psrc_data_processed.csv')
    df = pd.read_csv(data_file)

    # running ids -1, -2, ..., -n
    # NOTE(review): presumably negative so they cannot clash with ids of other datasets - confirm
    df['id'] = -1 * np.arange(1, df.shape[0] + 1)

    # fix col names / values in which 'None' shows up as '0ne'; also strip col names
    df = df.rename(columns=lambda x: x.replace('choice=0ne', 'choice=None').strip())
    df = df.replace('0ne', 'None')

    # rename values
    df = rename_values(df)
    df = data.derived_feats(df)
    if impute_feats:
        df = impute(df)

    # drop unnecessary (the two time columns are only dropped when present,
    # the remaining four columns are required to exist)
    ks_drop = [k for k in df.keys()
               if k in ['Time patient trauma alert concluded/ left trauma bay (military time)',
                        'Time CT performed',
                        ]]
    df = df.drop(columns=ks_drop + ['Record ID', 'Arrival time in ED', 'Age in months', 'Age in years'])

    # outcomes: every column mentioning an intervention for IAI,
    # and the subset that is not the 'choice=None' column
    iai_keys = [k for k in df.keys() if 'Interventions for IAI' in k]
    iai_with_intervention_keys = [k for k in iai_keys if 'choice=None' not in k]

    # iai: row-wise sum over the corresponding columns is positive
    df['iai'] = df[iai_keys].sum(axis=1) > 0
    df['iai_intervention'] = df[iai_with_intervention_keys].sum(axis=1) > 0

    df = df.infer_objects()
    df = data.add_cv_split(df, dset='psrc')
    if dummy:
        df = data.to_dummies(df)
    df['dset'] = 'psrc'

    # save (a bare filename has an empty dirname, and os.makedirs('') would raise)
    processed_dir = os.path.dirname(processed_file)
    if processed_dir:
        os.makedirs(processed_dir, exist_ok=True)
    df.to_pickle(processed_file)
    return df
def rename_values(df):
    '''Rename a few raw columns and add derived, human-readable feature columns

    Params
    ------
    df: pd.DataFrame
        raw df; it is not modified (the initial rename returns a new frame)

    Returns
    -------
    df: pd.DataFrame
        renamed frame with the new feature columns appended
    '''
    yes_no = {0: 'no', 1: 'yes', 'unknown': 'unknown'}

    def to_yes_no(values):
        # entries without a key in yes_no come out as NaN
        return values.map(yes_no)

    def to_yes_no_unknown(values):
        # missing entries are reported as 'unknown'
        return to_yes_no(values.fillna('unknown'))

    new_names = {
        'Seatbelt sign': 'SeatBeltSign',
        'Initial GCS': 'GCSScore',
        'Lower chest wall/costal margin tenderness to palpation (choice=1 on left)': 'LtCostalTender',
        'Lower chest wall/costal margin tenderness to palpation (choice=1 on right)': 'RtCostalTender',
    }
    df = df.rename(columns=new_names)
    n_rows = df.shape[0]

    # age in years; a missing years / months component counts as 0
    years = df['Age in years'].fillna(0)
    months = df['Age in months'].fillna(0)
    df['Age'] = years + months / 12

    df['InitSysBPRange'] = df['Initial ED systolic BP']
    df['InitHeartRate'] = df['Initial ED HR']
    # row-wise sum: requires 'Femur fracture' to select more than one column
    df['FemurFracture'] = df['Femur fracture'].sum(axis=1)

    df['Race_orig'] = df['Race'].fillna('unknown')
    df['Hispanic'] = to_yes_no_unknown(df['Hispanic ethnicity'])
    df['SeatBeltSign'] = to_yes_no(df['SeatBeltSign'])
    df['AbdDistention'] = to_yes_no_unknown(df['Abdominal distension'])
    df['VomitWretch'] = to_yes_no_unknown(df['Emesis post injury'])
    # the '(choice=None)' columns are inverted: trauma is present when 'None' was not chosen
    df['AbdTrauma'] = to_yes_no(1 - df['Evidence of abdominal wall trauma (choice=None)'])
    # every value other than the string '0' counts as pain
    df['AbdomenPain'] = to_yes_no((df['Complainabd. pain'] != '0').astype(int))
    df['ThoracicTrauma'] = to_yes_no(1 - df['Evidence of thoracic trauma (choice=None)'])
    df['DecrBreathSound'] = to_yes_no(df['Evidence of thoracic trauma (choice=Decreased breath sounds)'])

    # element-wise OR over the fracture columns
    fracture_cols = ('Chest X-ray (choice=Rib fracture)',
                     'Indicate thoracic injury (choice=Clavicle fracture)',
                     'Chest X-ray (choice=Scapula fracture)',
                     'FemurFracture',
                     'Pelvic fracture')
    distracting = pd.Series(np.array([False] * n_rows), index=df.index)
    for col in fracture_cols:
        distracting = distracting | df[col]
    df['DistractingPain'] = distracting

    tenderness = {
        'None': 'None',
        'Mild': 'Mild',
        'Moderate': 'Moderate',
        'Severe': 'Severe',
        'Limited exam secondary to intubation/sedation': 'Severe',  # probably severe
        'unknown': 'None',
    }
    df['AbdTenderDegree'] = df['Abdominal tenderness to palpation'].fillna('None').map(tenderness)

    # mechanism of injury; if several indicator columns are 1, the last one listed wins
    mechanisms = {
        'Mechanism of injury (choice=Assault/struck)': 'Object struck abdomen',
        'Mechanism of injury (choice=ATV injury)': 'Motorcycle/ATV/Scooter collision',
        'Mechanism of injury (choice=Bike crash)': 'Bike collision/fall',
        'Mechanism of injury (choice=Bike struck by auto)': 'Pedestrian/bicyclist struck by moving vehicle',
        'Mechanism of injury (choice=Fall > 10 ft. height)': 'Fall from an elevation',
        'Mechanism of injury (choice=Golf cart injury)': 'Motorcycle/ATV/Scooter collision',
        'Mechanism of injury (choice=Motorcycle/dirt bike crash)': 'Motorcycle/ATV/Scooter collision',
        'Mechanism of injury (choice=MVC)': 'Motor vehicle collision',
        'Mechanism of injury (choice=Pedestrian struck by auto)': 'Pedestrian/bicyclist struck by moving vehicle',
        'Mechanism of injury (choice=Other blunt mechanism)': 'Object struck abdomen',
    }
    df['MOI'] = ['unknown'] * n_rows
    for col, label in mechanisms.items():
        df.loc[df[col] == 1, 'MOI'] = label

    df['CTScan'] = df['Abdominal CT scan performed']
    return df
def impute(df: pd.DataFrame):
    """Fill missing values of a few features in place and return the same df

    GCSScore, InitSysBPRange and InitHeartRate get their column median
    (the first two are additionally cast to int); missing AbdomenPain
    entries become 'other'.
    """
    # (column, cast to int after filling)
    median_filled = (
        ('GCSScore', True),
        ('InitSysBPRange', True),
        ('InitHeartRate', False),
    )
    for col, as_int in median_filled:
        filled = df[col].fillna(df[col].median())
        df[col] = filled.astype(int) if as_int else filled

    # other vars get specific imputations
    df['AbdomenPain'] = df['AbdomenPain'].fillna('other')
    return df
Functions
def get_data(use_processed=False, processed_file='../data/processed/df_psrc.pkl', dummy=False, impute_feats=True)
-
Run all the preprocessing
Params
use_processed
:bool
, optional- determines whether to load df from cached pkl (only for reading from the csv)
save_processed
:bool
, optional- if not using processed, determines whether to save the df
Expand source code
def get_data(use_processed=False, processed_file=oj(PROCESSED_DIR, 'df_psrc.pkl'), dummy=False, impute_feats=True):
    '''Run all the preprocessing for the PSRC data

    Params
    ------
    use_processed: bool, optional
        if True and processed_file exists, return the cached pkl instead of
        rebuilding the df from the csv
    processed_file: str, optional
        path of the cached pkl; it is (re)written every time the df is rebuilt
    dummy: bool, optional
        if True, the df is passed through data.to_dummies before being saved
    impute_feats: bool, optional
        if True, impute() is applied to the df

    Returns
    -------
    df: pd.DataFrame
        the cached df (when use_processed applies) or the freshly built one
    '''
    if use_processed and os.path.exists(processed_file):
        return pd.read_pickle(processed_file)

    data_file = oj(PSRC_DIR, 'psrc_data_processed.csv')
    df = pd.read_csv(data_file)

    # running ids -1, -2, ..., -n
    # NOTE(review): presumably negative so they cannot clash with ids of other datasets - confirm
    df['id'] = -1 * np.arange(1, df.shape[0] + 1)

    # fix col names / values in which 'None' shows up as '0ne'; also strip col names
    df = df.rename(columns=lambda x: x.replace('choice=0ne', 'choice=None').strip())
    df = df.replace('0ne', 'None')

    # rename values
    df = rename_values(df)
    df = data.derived_feats(df)
    if impute_feats:
        df = impute(df)

    # drop unnecessary (the two time columns are only dropped when present,
    # the remaining four columns are required to exist)
    ks_drop = [k for k in df.keys()
               if k in ['Time patient trauma alert concluded/ left trauma bay (military time)',
                        'Time CT performed',
                        ]]
    df = df.drop(columns=ks_drop + ['Record ID', 'Arrival time in ED', 'Age in months', 'Age in years'])

    # outcomes: every column mentioning an intervention for IAI,
    # and the subset that is not the 'choice=None' column
    iai_keys = [k for k in df.keys() if 'Interventions for IAI' in k]
    iai_with_intervention_keys = [k for k in iai_keys if 'choice=None' not in k]

    # iai: row-wise sum over the corresponding columns is positive
    df['iai'] = df[iai_keys].sum(axis=1) > 0
    df['iai_intervention'] = df[iai_with_intervention_keys].sum(axis=1) > 0

    df = df.infer_objects()
    df = data.add_cv_split(df, dset='psrc')
    if dummy:
        df = data.to_dummies(df)
    df['dset'] = 'psrc'

    # save (a bare filename has an empty dirname, and os.makedirs('') would raise)
    processed_dir = os.path.dirname(processed_file)
    if processed_dir:
        os.makedirs(processed_dir, exist_ok=True)
    df.to_pickle(processed_file)
    return df
def impute(df)
-
Returns df with imputed features
Expand source code
def impute(df: pd.DataFrame):
    """Fill missing values of a few features in place and return the same df

    GCSScore, InitSysBPRange and InitHeartRate get their column median
    (the first two are additionally cast to int); missing AbdomenPain
    entries become 'other'.
    """
    # (column, cast to int after filling)
    median_filled = (
        ('GCSScore', True),
        ('InitSysBPRange', True),
        ('InitHeartRate', False),
    )
    for col, as_int in median_filled:
        filled = df[col].fillna(df[col].median())
        df[col] = filled.astype(int) if as_int else filled

    # other vars get specific imputations
    df['AbdomenPain'] = df['AbdomenPain'].fillna('other')
    return df
def rename_values(df)
-
Map values to meanings, rename some features, compute a couple of new features, and set the types of some columns.
Expand source code
def rename_values(df):
    '''Rename a few raw columns and add derived, human-readable feature columns

    Params
    ------
    df: pd.DataFrame
        raw df; it is not modified (the initial rename returns a new frame)

    Returns
    -------
    df: pd.DataFrame
        renamed frame with the new feature columns appended
    '''
    yes_no = {0: 'no', 1: 'yes', 'unknown': 'unknown'}

    def to_yes_no(values):
        # entries without a key in yes_no come out as NaN
        return values.map(yes_no)

    def to_yes_no_unknown(values):
        # missing entries are reported as 'unknown'
        return to_yes_no(values.fillna('unknown'))

    new_names = {
        'Seatbelt sign': 'SeatBeltSign',
        'Initial GCS': 'GCSScore',
        'Lower chest wall/costal margin tenderness to palpation (choice=1 on left)': 'LtCostalTender',
        'Lower chest wall/costal margin tenderness to palpation (choice=1 on right)': 'RtCostalTender',
    }
    df = df.rename(columns=new_names)
    n_rows = df.shape[0]

    # age in years; a missing years / months component counts as 0
    years = df['Age in years'].fillna(0)
    months = df['Age in months'].fillna(0)
    df['Age'] = years + months / 12

    df['InitSysBPRange'] = df['Initial ED systolic BP']
    df['InitHeartRate'] = df['Initial ED HR']
    # row-wise sum: requires 'Femur fracture' to select more than one column
    df['FemurFracture'] = df['Femur fracture'].sum(axis=1)

    df['Race_orig'] = df['Race'].fillna('unknown')
    df['Hispanic'] = to_yes_no_unknown(df['Hispanic ethnicity'])
    df['SeatBeltSign'] = to_yes_no(df['SeatBeltSign'])
    df['AbdDistention'] = to_yes_no_unknown(df['Abdominal distension'])
    df['VomitWretch'] = to_yes_no_unknown(df['Emesis post injury'])
    # the '(choice=None)' columns are inverted: trauma is present when 'None' was not chosen
    df['AbdTrauma'] = to_yes_no(1 - df['Evidence of abdominal wall trauma (choice=None)'])
    # every value other than the string '0' counts as pain
    df['AbdomenPain'] = to_yes_no((df['Complainabd. pain'] != '0').astype(int))
    df['ThoracicTrauma'] = to_yes_no(1 - df['Evidence of thoracic trauma (choice=None)'])
    df['DecrBreathSound'] = to_yes_no(df['Evidence of thoracic trauma (choice=Decreased breath sounds)'])

    # element-wise OR over the fracture columns
    fracture_cols = ('Chest X-ray (choice=Rib fracture)',
                     'Indicate thoracic injury (choice=Clavicle fracture)',
                     'Chest X-ray (choice=Scapula fracture)',
                     'FemurFracture',
                     'Pelvic fracture')
    distracting = pd.Series(np.array([False] * n_rows), index=df.index)
    for col in fracture_cols:
        distracting = distracting | df[col]
    df['DistractingPain'] = distracting

    tenderness = {
        'None': 'None',
        'Mild': 'Mild',
        'Moderate': 'Moderate',
        'Severe': 'Severe',
        'Limited exam secondary to intubation/sedation': 'Severe',  # probably severe
        'unknown': 'None',
    }
    df['AbdTenderDegree'] = df['Abdominal tenderness to palpation'].fillna('None').map(tenderness)

    # mechanism of injury; if several indicator columns are 1, the last one listed wins
    mechanisms = {
        'Mechanism of injury (choice=Assault/struck)': 'Object struck abdomen',
        'Mechanism of injury (choice=ATV injury)': 'Motorcycle/ATV/Scooter collision',
        'Mechanism of injury (choice=Bike crash)': 'Bike collision/fall',
        'Mechanism of injury (choice=Bike struck by auto)': 'Pedestrian/bicyclist struck by moving vehicle',
        'Mechanism of injury (choice=Fall > 10 ft. height)': 'Fall from an elevation',
        'Mechanism of injury (choice=Golf cart injury)': 'Motorcycle/ATV/Scooter collision',
        'Mechanism of injury (choice=Motorcycle/dirt bike crash)': 'Motorcycle/ATV/Scooter collision',
        'Mechanism of injury (choice=MVC)': 'Motor vehicle collision',
        'Mechanism of injury (choice=Pedestrian struck by auto)': 'Pedestrian/bicyclist struck by moving vehicle',
        'Mechanism of injury (choice=Other blunt mechanism)': 'Object struck abdomen',
    }
    df['MOI'] = ['unknown'] * n_rows
    for col, label in mechanisms.items():
        df.loc[df[col] == 1, 'MOI'] = label

    df['CTScan'] = df['Abdominal CT scan performed']
    return df