Module imodelsx.process_results
Functions
def average_over_seeds(df: pandas.core.frame.DataFrame,
experiment_filename='01_train_model.py',
key_to_average_over='seed')-
Expand source code
def average_over_seeds(
    df: pd.DataFrame,
    experiment_filename="01_train_model.py",
    key_to_average_over="seed",
):
    """Returns values averaged over seed.

    Groups by every experiment argument (read from the experiment script's
    argparser) except ``key_to_average_over``, then aggregates each numeric
    column with the mean and the standard error of the mean. For a numeric
    column such as 'accuracy_test', the output holds the mean in
    'accuracy_test' and the SEM in 'accuracy_test_err'.

    Params
    ------
    experiment_filename: str
        Full path + name of the experiments script, e.g.
        /home/user/tree-prompt/experiments/01_train_model.py
        This is used to get the names of the arguments to aggregate over
    """

    def sem(values):
        """Standard error of the mean (ddof=0), with warnings silenced."""
        with warnings.catch_warnings():
            # silence warnings emitted during SEM computation
            # (e.g. for degenerate / NaN-containing groups)
            warnings.simplefilter("ignore")
            return scipy.stats.sem(values, ddof=0)

    # group by every experiment argument that is present in df,
    # except the key being averaged over
    arg_names = get_main_args_list(experiment_filename)
    group_keys = [
        k for k in arg_names if k != key_to_average_over and k in df.columns
    ]
    numeric_keys = [
        k for k in df.select_dtypes("number").columns if k not in group_keys
    ]

    grouped = df.groupby(by=group_keys)[numeric_keys]
    df_avg = grouped.aggregate(["mean", sem]).reset_index()

    # flatten the (column, statistic) MultiIndex: means keep the original
    # column name, SEM columns get the "_err" suffix
    flat_names = []
    for col_name, stat_name in df_avg.columns:
        flat_names.append(col_name + "_err" if stat_name == "sem" else col_name)
    df_avg.columns = flat_names
    return df_avg
Returns values averaged over seed. Standard errors of the mean are added with columns suffixed with _err For example, 'accuracy_test' yields two columns 'accuracy_test' now holds the mean value 'accuracy_test_err' now holds the standard error of the mean
Params
experiment_filename: str Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py This is used to get the names of the arguments to aggregate over
def delete_runs_in_dataframe(df: pandas.core.frame.DataFrame,
actually_delete=False,
directory_key='save_dir_unique')-
Expand source code
def delete_runs_in_dataframe(
    df: pd.DataFrame,
    actually_delete=False,
    directory_key="save_dir_unique",
):
    """Deletes stored results for all runs in the dataframe ``df``.

    Params
    ------
    df: pd.DataFrame
        One run per row; column ``directory_key`` holds each run's save directory.
    actually_delete: bool
        Safety switch — nothing is removed unless this is True.
    directory_key: str
        Name of the column containing the directory path for each run.
    """
    import shutil  # local import: only needed when actually deleting

    if not actually_delete:
        print(
            f"Not actually deleting {df.shape[0]} directories. Set actually_delete=True to actually delete the directories."
        )
        return

    num_deleted = 0
    for i in tqdm(range(df.shape[0])):
        save_dir = df.iloc[i][directory_key]
        try:
            # shutil.rmtree instead of os.system(f"rm -rf {save_dir}"):
            # avoids shell quoting/injection issues with unusual paths, and
            # raises on failure so num_deleted counts real deletions
            # (os.system never raises, so the old count was always df.shape[0])
            shutil.rmtree(save_dir)
            num_deleted += 1
        except FileNotFoundError:
            # already gone -- nothing to do
            pass
        except OSError as e:
            # keep going on other failures (permissions, etc.), but say why
            print(f"Could not delete {save_dir}: {e}")
    print(f"Deleted {num_deleted}/{df.shape[0]} directories.")
Deletes stored results for all runs in the dataframe `df`.
def fill_missing_args_with_default(df, experiment_filename='01_train_model.py')
-
Expand source code
def fill_missing_args_with_default(df, experiment_filename="01_train_model.py"):
    """Fill columns of ``df`` that are missing or NaN with the experiment
    script's argparser defaults.

    Imports the experiment script, builds its argparser (main +
    computational args), parses an empty argument list to get the defaults,
    then for each default: adds the column if absent, otherwise fills NaNs
    with the default value. Returns ``df`` (also modified in place).

    Params
    ------
    experiment_filename: str
        Full path + name of the experiments script, e.g.
        /home/user/tree-prompt/experiments/01_train_model.py
    """
    # strip the .py extension so the basename can be passed to __import__
    if experiment_filename.endswith(".py"):
        experiment_filename = experiment_filename[:-3]
    # this is kind of weird -- assumes the script is in the same level as the notebook that calls this function
    sys.path.append(dirname(experiment_filename))
    # or the nb can be one level below
    sys.path.append('../' + dirname(experiment_filename))
    # NOTE(review): relies on dirname/basename being imported at module
    # level (not visible in this chunk) -- confirm against the file's imports
    train_script = __import__(basename(experiment_filename))
    # defaults come from parsing an empty argv against the script's parser
    parser = train_script.add_main_args(argparse.ArgumentParser())
    parser = train_script.add_computational_args(parser)
    args = parser.parse_args([])
    args_dict = vars(args)
    for k, v in args_dict.items():
        if k not in df.columns:
            # column absent entirely: create it with the default value
            df[k] = v
        else:
            if v is None:
                # NOTE(review): filling NaN with NaN is a no-op -- presumably
                # intentional (a None default means "leave missing as NaN"),
                # but worth confirming
                df[k] = df[k].fillna(np.nan)
            else:
                df[k] = df[k].fillna(v)
    return df
Returns main arguments from the argparser used by an experiments script Params
experiment_filename: str Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py
def get_experiment_keys(df, experiment_filename, exclude_seed=False)
-
Expand source code
def get_experiment_keys(df, experiment_filename, exclude_seed=False):
    """Return the experiment arguments that actually vary across ``df``.

    An argument is kept only when it is a column of ``df`` and takes more
    than one distinct value there; with ``exclude_seed=True`` the "seed"
    key is dropped from the result as well.
    """
    keys = []
    for key in get_main_args_list(experiment_filename=experiment_filename):
        if exclude_seed and key == "seed":
            continue
        # len(unique()) rather than nunique(): unique() counts NaN as a value
        if key in df.columns and len(df[key].unique()) > 1:
            keys.append(key)
    return keys
def get_main_args_list(experiment_filename='01_train_model.py')
-
Expand source code
def get_main_args_list(experiment_filename="01_train_model.py"):
    """Returns main arguments from the argparser used by an experiments script

    Params
    ------
    experiment_filename: str
        Full path + name of the experiments script, e.g.
        /home/user/tree-prompt/experiments/01_train_model.py
    """
    # strip the .py extension so the basename can be passed to __import__
    if experiment_filename.endswith(".py"):
        experiment_filename = experiment_filename[: -len(".py")]
    # make the script importable by module name, then import it
    sys.path.append(os.path.dirname(experiment_filename))
    train_script = __import__(os.path.basename(experiment_filename))
    # parse an empty argv to recover just the argument names
    parsed = train_script.add_main_args(argparse.ArgumentParser()).parse_args([])
    return list(vars(parsed).keys())
Returns main arguments from the argparser used by an experiments script
Params
experiment_filename: str Full path + name of the experiments script, e.g. /home/user/tree-prompt/experiments/01_train_model.py
def get_results_df(results_dir, use_cached=False, results_fname='results.pkl', save_pickle=False) ‑> pandas.core.frame.DataFrame
-
Expand source code
def get_results_df(results_dir, use_cached=False, results_fname='results.pkl', save_pickle=False) -> pd.DataFrame:
    """Load results from a directory of experiments; each experiment is a row
    in the returned dataframe.

    Params
    ------
    results_dir: str
        Directory containing one subdirectory per run.
    use_cached: bool
        If True and an aggregated pickle already exists, load it instead of
        rescanning the run directories.
    results_fname: str
        Name of the per-run results file (.pkl/.pickle/.joblib or .json).
    save_pickle: bool
        If True, cache the aggregated dataframe as results_aggregated.pkl.
    """
    fname = join(results_dir, "results_aggregated.pkl")
    if use_cached and os.path.exists(fname):
        return pd.read_pickle(fname)

    # only run directories that actually contain a results file
    dir_names = sorted(
        [
            d
            for d in os.listdir(results_dir)
            if os.path.isdir(join(results_dir, d))
            and os.path.exists(join(results_dir, d, results_fname))
        ]
    )

    results_list = []
    for dir_name in tqdm(dir_names):
        results_path = join(results_dir, dir_name, results_fname)
        try:
            if results_fname.endswith((".pkl", ".pickle", ".joblib")):
                result = joblib.load(results_path)
            elif results_fname.endswith(".json"):
                # context manager: the old code leaked the file handle
                with open(results_path, "r") as f:
                    result = json.load(f)
            else:
                # previously an unknown extension left `result` unbound and the
                # NameError was masked by a bare except -- fail loudly instead
                print(f"Unsupported results file extension: {results_fname}")
                continue
            if isinstance(result, list):
                # a single stored run may hold several results
                results_list.extend(pd.Series(r) for r in result)
            else:
                results_list.append(pd.Series(result))
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate
        except Exception:
            print(f'Error loading {results_path}')

    if not results_list:
        # pd.concat raises on an empty list; no runs loaded -> empty frame
        return pd.DataFrame()

    r = pd.concat(results_list, axis=1).T.infer_objects()
    if save_pickle:
        r.to_pickle(fname)
    return r
Load results from a directory of experiments; each experiment is a row in the dataframe
def remove_columns_with_static_values(df: pandas.core.frame.DataFrame)
-
Expand source code
def remove_columns_with_static_values(df: pd.DataFrame):
    """Removes columns that have the same value for all rows"""
    # keep only columns with at least two distinct (non-NaN) values
    varying_cols = [col for col in df.columns if df[col].nunique() > 1]
    return df[varying_cols]
Removes columns that have the same value for all rows