Module imodelsx.process_results
Expand source code
import argparse
import sys
import os.path
from os.path import dirname, join
from os.path import join
from tqdm import tqdm
import pandas as pd
import pickle as pkl
import sys
import warnings
import scipy.stats
import numpy as np
import joblib
import json
repo_dir = dirname(dirname(os.path.abspath(__file__)))
def get_results_df(results_dir, use_cached=False, results_fname='results.pkl', save_pickle=False) -> pd.DataFrame:
    """Load results from a directory of experiments into a dataframe.

    Each immediate subdirectory of ``results_dir`` that contains a file named
    ``results_fname`` is treated as one experiment and becomes one row.

    Params
    ------
    results_dir: str
        Directory whose subdirectories each hold one experiment's results
    use_cached: bool
        If True and ``results_dir/results_aggregated.pkl`` exists, return that
        pickled dataframe instead of re-reading the individual results
    results_fname: str
        Name of the results file inside each experiment directory; must end in
        .pkl, .pickle or .joblib (read with joblib) or in .json
    save_pickle: bool
        If True, write the aggregated dataframe to
        ``results_dir/results_aggregated.pkl``

    Returns
    -------
    pd.DataFrame with one row per successfully loaded experiment

    Raises
    ------
    ValueError
        If ``results_fname`` has an unsupported extension or no results could
        be loaded
    """
    cache_fname = join(results_dir, "results_aggregated.pkl")
    if use_cached and os.path.exists(cache_fname):
        return pd.read_pickle(cache_fname)

    joblib_exts = (".pkl", ".pickle", ".joblib")
    if not results_fname.endswith(joblib_exts + (".json",)):
        # Fail once, up front, instead of reporting a load error per directory
        raise ValueError(
            f"Unsupported results file extension: {results_fname}")

    dir_names = sorted(
        name
        for name in os.listdir(results_dir)
        if os.path.isdir(join(results_dir, name))
        and os.path.exists(join(results_dir, name, results_fname))
    )
    if not dir_names:
        raise ValueError(
            f"No subdirectory of {results_dir} contains {results_fname}")

    results_list = []
    for dir_name in tqdm(dir_names):
        path = join(results_dir, dir_name, results_fname)
        try:
            if results_fname.endswith(joblib_exts):
                # NOTE: joblib.load unpickles arbitrary objects; only point
                # this at result directories you trust
                result = joblib.load(path)
            else:
                with open(path, "r") as f:
                    result = json.load(f)
            results_list.append(pd.Series(result))
        except Exception as e:
            # Best effort: skip unreadable runs but say why they were skipped
            print(f'Error loading {path}: {e!r}')

    if not results_list:
        raise ValueError(
            f"None of the {len(dir_names)} result files in {results_dir} could be loaded")

    # Each Series becomes a column after concat; transpose to one row per run
    r = pd.concat(results_list, axis=1).T.infer_objects()
    if save_pickle:
        r.to_pickle(cache_fname)
    return r
def get_main_args_list(experiment_filename="01_train_model.py"):
    """Return the names of the main arguments declared by an experiments script.

    The script is imported as a module and its ``add_main_args(parser)``
    function is called on a fresh ``argparse.ArgumentParser``; the names of
    the resulting arguments are returned in declaration order.

    Params
    ------
    experiment_filename: str
        Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py

    Returns
    -------
    list of str
    """
    import importlib

    if experiment_filename.endswith(".py"):
        experiment_filename = experiment_filename[:-3]
    script_dir = os.path.dirname(experiment_filename)
    # Register the script's directory once; repeated calls must not keep
    # growing sys.path
    if script_dir not in sys.path:
        sys.path.append(script_dir)
    train_script = importlib.import_module(
        os.path.basename(experiment_filename))
    args = train_script.add_main_args(argparse.ArgumentParser()).parse_args([])
    return list(vars(args).keys())
def fill_missing_args_with_default(df, experiment_filename="01_train_model.py"):
    """Fill missing experiment arguments in df with the script's defaults.

    The experiments script is imported and its ``add_main_args`` and
    ``add_computational_args`` functions are used to build an argparser.
    For every argument, a column that is absent from ``df`` is created with
    the default value, and missing entries of an existing column are
    replaced by the default. ``df`` is modified in place and also returned.

    Params
    ------
    df: pd.DataFrame
        Results dataframe, one row per run
    experiment_filename: str
        Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py

    Returns
    -------
    The same dataframe object, with defaults filled in
    """
    import importlib

    if experiment_filename.endswith(".py"):
        experiment_filename = experiment_filename[:-3]
    script_dir = os.path.dirname(experiment_filename)
    # Register the script's directory once; repeated calls must not keep
    # growing sys.path
    if script_dir not in sys.path:
        sys.path.append(script_dir)
    train_script = importlib.import_module(
        os.path.basename(experiment_filename))
    parser = train_script.add_main_args(argparse.ArgumentParser())
    parser = train_script.add_computational_args(parser)
    args_dict = vars(parser.parse_args([]))

    for k, v in args_dict.items():
        # List-like defaults (e.g. from nargs) must be placed one per cell;
        # plain assignment/fillna would spread or reject them. Every filled
        # cell references the same default object.
        list_like = pd.api.types.is_list_like(v)
        if k not in df.columns:
            if list_like:
                df[k] = pd.Series([v] * len(df), index=df.index, dtype=object)
            else:
                df[k] = v
        elif v is None:
            # No default to fill in; this only normalizes None to NaN
            df[k] = df[k].fillna(np.nan)
        elif list_like:
            missing = df[k].isna()
            df[k] = pd.Series(
                [v if is_missing else cur
                 for cur, is_missing in zip(df[k], missing)],
                index=df.index,
                dtype=object,
            )
        else:
            df[k] = df[k].fillna(v)
    return df
def delete_runs_in_dataframe(
    df: pd.DataFrame, actually_delete=False,
    directory_key="save_dir_unique",
):
    """Delete the stored results of every run listed in the dataframe df.

    Params
    ------
    df: pd.DataFrame
        One row per run; column ``directory_key`` holds the path to delete
    actually_delete: bool
        Safety switch. If False (default) nothing is removed and a message
        with the number of affected rows is printed instead
    directory_key: str
        Name of the column holding each run's save directory

    Returns
    -------
    None
    """
    import shutil

    if not actually_delete:
        print(
            f"Not actually deleting {df.shape[0]} directories. Set actually_delete=True to actually delete the directories."
        )
        return

    num_deleted = 0
    for save_dir in tqdm(df[directory_key].tolist()):
        # Skip NaN/None entries and paths that are already gone
        if not isinstance(save_dir, (str, os.PathLike)):
            continue
        if not os.path.lexists(save_dir):
            continue
        try:
            # Delete through the filesystem API rather than a shell command,
            # so spaces or shell metacharacters in the path are never
            # interpreted (and globs are never expanded)
            if os.path.isdir(save_dir) and not os.path.islink(save_dir):
                shutil.rmtree(save_dir)
            else:
                os.remove(save_dir)
            num_deleted += 1
        except OSError as e:
            print(f"Failed to delete {save_dir}: {e}")
    print(f"Deleted {num_deleted}/{df.shape[0]} directories.")
def average_over_seeds(
    df: pd.DataFrame,
    experiment_filename="01_train_model.py",
    key_to_average_over="seed",
):
    """Returns values averaged over seed.

    Rows are grouped by every main argument of the experiments script
    (except ``key_to_average_over``) that is a column of ``df``; every other
    numeric column is aggregated within each group.

    Standard errors of the mean are added with columns suffixed with _err
    For example, 'accuracy_test' yields two columns
        'accuracy_test' now holds the mean value
        'accuracy_test_err' now holds the standard error of the mean

    Params
    ------
    df: pd.DataFrame
        Results dataframe, one row per run
    experiment_filename: str
        Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py
        This is used to get the names of the arguments to aggregate over
    key_to_average_over: str
        Argument whose different values are averaged together

    Returns
    -------
    pd.DataFrame with one row per combination of group-key values
    """

    # NOTE: the function must stay named `sem`; pandas uses the function name
    # as the column label that is matched when renaming columns below.
    def sem(x):
        """Compute standard error of the mean (ddof=0), silencing warnings"""
        with warnings.catch_warnings():
            # catch_warnings() alone only saves/restores the filter state;
            # the filter has to be set explicitly to suppress anything
            warnings.simplefilter("ignore")
            return scipy.stats.sem(x, ddof=0)

    group_keys = [
        k
        for k in get_main_args_list(experiment_filename)
        if k != key_to_average_over and k in df.columns
    ]
    numeric_keys = [
        k for k in df.select_dtypes("number") if k not in group_keys]
    # dropna=False: by default groupby silently discards every row that has
    # NaN/None in any group key, which would drop runs whose argument was
    # left unset
    df_avg = (
        df.groupby(by=group_keys, dropna=False)[numeric_keys]
        .aggregate(['mean', sem])
        .reset_index()
    )
    # Flatten the (column, statistic) labels: mean keeps the original name,
    # sem gets the _err suffix, group keys keep their name
    df_avg.columns = [x[0] + "_err" if x[1] == "sem" else x[0]
                      for x in df_avg.columns]
    return df_avg
def remove_columns_with_static_values(df: pd.DataFrame):
    """Keep only the columns whose values vary across rows.

    A column is kept when ``DataFrame.nunique`` reports more than one
    distinct value for it; all rows are retained.
    """
    distinct_counts = df.nunique()
    varies = distinct_counts > 1
    return df.loc[:, varies]
def get_experiment_keys(df, experiment_filename, exclude_seed=False):
    """Return the main experiment arguments that vary across rows of df.

    An argument is kept when it is a column of ``df`` and that column holds
    more than one unique value. With ``exclude_seed=True`` the "seed"
    argument is left out of the result.
    """
    varying_args = []
    for arg_name in get_main_args_list(experiment_filename=experiment_filename):
        if arg_name not in df.columns:
            continue
        if len(df[arg_name].unique()) <= 1:
            continue
        if exclude_seed and arg_name == "seed":
            continue
        varying_args.append(arg_name)
    return varying_args
Functions
def average_over_seeds(df: pandas.core.frame.DataFrame, experiment_filename='01_train_model.py', key_to_average_over='seed')
-
Returns values averaged over seed. Standard errors of the mean are added in columns suffixed with _err. For example, 'accuracy_test' yields two columns: 'accuracy_test' now holds the mean value, and 'accuracy_test_err' holds the standard error of the mean.
Params
experiment_filename: str Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py This is used to get the names of the arguments to aggregate over
Expand source code
def average_over_seeds(
    df: pd.DataFrame,
    experiment_filename="01_train_model.py",
    key_to_average_over="seed",
):
    """Returns values averaged over seed.

    Rows are grouped by every main argument of the experiments script
    (except ``key_to_average_over``) that is a column of ``df``; every other
    numeric column is aggregated within each group.

    Standard errors of the mean are added with columns suffixed with _err
    For example, 'accuracy_test' yields two columns
        'accuracy_test' now holds the mean value
        'accuracy_test_err' now holds the standard error of the mean

    Params
    ------
    df: pd.DataFrame
        Results dataframe, one row per run
    experiment_filename: str
        Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py
        This is used to get the names of the arguments to aggregate over
    key_to_average_over: str
        Argument whose different values are averaged together

    Returns
    -------
    pd.DataFrame with one row per combination of group-key values
    """

    # NOTE: the function must stay named `sem`; pandas uses the function name
    # as the column label that is matched when renaming columns below.
    def sem(x):
        """Compute standard error of the mean (ddof=0), silencing warnings"""
        with warnings.catch_warnings():
            # catch_warnings() alone only saves/restores the filter state;
            # the filter has to be set explicitly to suppress anything
            warnings.simplefilter("ignore")
            return scipy.stats.sem(x, ddof=0)

    group_keys = [
        k
        for k in get_main_args_list(experiment_filename)
        if k != key_to_average_over and k in df.columns
    ]
    numeric_keys = [
        k for k in df.select_dtypes("number") if k not in group_keys]
    # dropna=False: by default groupby silently discards every row that has
    # NaN/None in any group key, which would drop runs whose argument was
    # left unset
    df_avg = (
        df.groupby(by=group_keys, dropna=False)[numeric_keys]
        .aggregate(['mean', sem])
        .reset_index()
    )
    # Flatten the (column, statistic) labels: mean keeps the original name,
    # sem gets the _err suffix, group keys keep their name
    df_avg.columns = [x[0] + "_err" if x[1] == "sem" else x[0]
                      for x in df_avg.columns]
    return df_avg
def delete_runs_in_dataframe(df: pandas.core.frame.DataFrame, actually_delete=False, directory_key='save_dir_unique')
-
Deletes stored results for all runs in the dataframe df.
Expand source code
def delete_runs_in_dataframe(
    df: pd.DataFrame, actually_delete=False,
    directory_key="save_dir_unique",
):
    """Delete the stored results of every run listed in the dataframe df.

    Params
    ------
    df: pd.DataFrame
        One row per run; column ``directory_key`` holds the path to delete
    actually_delete: bool
        Safety switch. If False (default) nothing is removed and a message
        with the number of affected rows is printed instead
    directory_key: str
        Name of the column holding each run's save directory

    Returns
    -------
    None
    """
    import shutil

    if not actually_delete:
        print(
            f"Not actually deleting {df.shape[0]} directories. Set actually_delete=True to actually delete the directories."
        )
        return

    num_deleted = 0
    for save_dir in tqdm(df[directory_key].tolist()):
        # Skip NaN/None entries and paths that are already gone
        if not isinstance(save_dir, (str, os.PathLike)):
            continue
        if not os.path.lexists(save_dir):
            continue
        try:
            # Delete through the filesystem API rather than a shell command,
            # so spaces or shell metacharacters in the path are never
            # interpreted (and globs are never expanded)
            if os.path.isdir(save_dir) and not os.path.islink(save_dir):
                shutil.rmtree(save_dir)
            else:
                os.remove(save_dir)
            num_deleted += 1
        except OSError as e:
            print(f"Failed to delete {save_dir}: {e}")
    print(f"Deleted {num_deleted}/{df.shape[0]} directories.")
def fill_missing_args_with_default(df, experiment_filename='01_train_model.py')
-
Fills in missing experiment arguments in df with the default values declared by the argparser of an experiments script, and returns df. Params
experiment_filename: str Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py
Expand source code
def fill_missing_args_with_default(df, experiment_filename="01_train_model.py"):
    """Fill missing experiment arguments in df with the script's defaults.

    The experiments script is imported and its ``add_main_args`` and
    ``add_computational_args`` functions are used to build an argparser.
    For every argument, a column that is absent from ``df`` is created with
    the default value, and missing entries of an existing column are
    replaced by the default. ``df`` is modified in place and also returned.

    Params
    ------
    df: pd.DataFrame
        Results dataframe, one row per run
    experiment_filename: str
        Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py

    Returns
    -------
    The same dataframe object, with defaults filled in
    """
    import importlib

    if experiment_filename.endswith(".py"):
        experiment_filename = experiment_filename[:-3]
    script_dir = os.path.dirname(experiment_filename)
    # Register the script's directory once; repeated calls must not keep
    # growing sys.path
    if script_dir not in sys.path:
        sys.path.append(script_dir)
    train_script = importlib.import_module(
        os.path.basename(experiment_filename))
    parser = train_script.add_main_args(argparse.ArgumentParser())
    parser = train_script.add_computational_args(parser)
    args_dict = vars(parser.parse_args([]))

    for k, v in args_dict.items():
        # List-like defaults (e.g. from nargs) must be placed one per cell;
        # plain assignment/fillna would spread or reject them. Every filled
        # cell references the same default object.
        list_like = pd.api.types.is_list_like(v)
        if k not in df.columns:
            if list_like:
                df[k] = pd.Series([v] * len(df), index=df.index, dtype=object)
            else:
                df[k] = v
        elif v is None:
            # No default to fill in; this only normalizes None to NaN
            df[k] = df[k].fillna(np.nan)
        elif list_like:
            missing = df[k].isna()
            df[k] = pd.Series(
                [v if is_missing else cur
                 for cur, is_missing in zip(df[k], missing)],
                index=df.index,
                dtype=object,
            )
        else:
            df[k] = df[k].fillna(v)
    return df
def get_experiment_keys(df, experiment_filename, exclude_seed=False)
-
Expand source code
def get_experiment_keys(df, experiment_filename, exclude_seed=False):
    """Return the main experiment arguments that vary across rows of df.

    An argument is kept when it is a column of ``df`` and that column holds
    more than one unique value. With ``exclude_seed=True`` the "seed"
    argument is left out of the result.
    """
    varying_args = []
    for arg_name in get_main_args_list(experiment_filename=experiment_filename):
        if arg_name not in df.columns:
            continue
        if len(df[arg_name].unique()) <= 1:
            continue
        if exclude_seed and arg_name == "seed":
            continue
        varying_args.append(arg_name)
    return varying_args
def get_main_args_list(experiment_filename='01_train_model.py')
-
Returns main arguments from the argparser used by an experiments script
Params
experiment_filename: str Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py
Expand source code
def get_main_args_list(experiment_filename="01_train_model.py"):
    """Return the names of the main arguments declared by an experiments script.

    The script is imported as a module and its ``add_main_args(parser)``
    function is called on a fresh ``argparse.ArgumentParser``; the names of
    the resulting arguments are returned in declaration order.

    Params
    ------
    experiment_filename: str
        Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py

    Returns
    -------
    list of str
    """
    import importlib

    if experiment_filename.endswith(".py"):
        experiment_filename = experiment_filename[:-3]
    script_dir = os.path.dirname(experiment_filename)
    # Register the script's directory once; repeated calls must not keep
    # growing sys.path
    if script_dir not in sys.path:
        sys.path.append(script_dir)
    train_script = importlib.import_module(
        os.path.basename(experiment_filename))
    args = train_script.add_main_args(argparse.ArgumentParser()).parse_args([])
    return list(vars(args).keys())
def get_results_df(results_dir, use_cached=False, results_fname='results.pkl', save_pickle=False) ‑> pandas.core.frame.DataFrame
-
Load results from a directory of experiments; each experiment is a row in the dataframe.
Expand source code
def get_results_df(results_dir, use_cached=False, results_fname='results.pkl', save_pickle=False) -> pd.DataFrame:
    """Load results from a directory of experiments into a dataframe.

    Each immediate subdirectory of ``results_dir`` that contains a file named
    ``results_fname`` is treated as one experiment and becomes one row.

    Params
    ------
    results_dir: str
        Directory whose subdirectories each hold one experiment's results
    use_cached: bool
        If True and ``results_dir/results_aggregated.pkl`` exists, return that
        pickled dataframe instead of re-reading the individual results
    results_fname: str
        Name of the results file inside each experiment directory; must end in
        .pkl, .pickle or .joblib (read with joblib) or in .json
    save_pickle: bool
        If True, write the aggregated dataframe to
        ``results_dir/results_aggregated.pkl``

    Returns
    -------
    pd.DataFrame with one row per successfully loaded experiment

    Raises
    ------
    ValueError
        If ``results_fname`` has an unsupported extension or no results could
        be loaded
    """
    cache_fname = join(results_dir, "results_aggregated.pkl")
    if use_cached and os.path.exists(cache_fname):
        return pd.read_pickle(cache_fname)

    joblib_exts = (".pkl", ".pickle", ".joblib")
    if not results_fname.endswith(joblib_exts + (".json",)):
        # Fail once, up front, instead of reporting a load error per directory
        raise ValueError(
            f"Unsupported results file extension: {results_fname}")

    dir_names = sorted(
        name
        for name in os.listdir(results_dir)
        if os.path.isdir(join(results_dir, name))
        and os.path.exists(join(results_dir, name, results_fname))
    )
    if not dir_names:
        raise ValueError(
            f"No subdirectory of {results_dir} contains {results_fname}")

    results_list = []
    for dir_name in tqdm(dir_names):
        path = join(results_dir, dir_name, results_fname)
        try:
            if results_fname.endswith(joblib_exts):
                # NOTE: joblib.load unpickles arbitrary objects; only point
                # this at result directories you trust
                result = joblib.load(path)
            else:
                with open(path, "r") as f:
                    result = json.load(f)
            results_list.append(pd.Series(result))
        except Exception as e:
            # Best effort: skip unreadable runs but say why they were skipped
            print(f'Error loading {path}: {e!r}')

    if not results_list:
        raise ValueError(
            f"None of the {len(dir_names)} result files in {results_dir} could be loaded")

    # Each Series becomes a column after concat; transpose to one row per run
    r = pd.concat(results_list, axis=1).T.infer_objects()
    if save_pickle:
        r.to_pickle(cache_fname)
    return r
def remove_columns_with_static_values(df: pandas.core.frame.DataFrame)
-
Removes columns that have the same value for all rows
Expand source code
def remove_columns_with_static_values(df: pd.DataFrame):
    """Keep only the columns whose values vary across rows.

    A column is kept when ``DataFrame.nunique`` reports more than one
    distinct value for it; all rows are retained.
    """
    distinct_counts = df.nunique()
    varies = distinct_counts > 1
    return df.loc[:, varies]