Commit b5cdd1b4 authored by Kateryna Nikulina

del _diamonds in files name

parent b20054b0
@@ -59,7 +59,7 @@ for combo in combos:
     df_loss.loc[idx, 'end_eps'] = end_eps
     df_loss.loc[idx, 'end_sample'] = end_sample
     idx += 1
-df_loss.to_csv(f'./end_eps_for_{df_names[0]}_in_pair_w_{df_names[1]}_full_diamonds.csv', index=False)
+df_loss.to_csv(f'./end_eps_for_{df_names[0]}_in_pair_w_{df_names[1]}_full.csv', index=False)
 # second dataset
@@ -74,7 +74,7 @@ for combo in combos:
     df_loss.loc[idx, 'end_eps'] = end_eps
     df_loss.loc[idx, 'end_sample'] = end_sample
     idx += 1
-df_loss.to_csv(f'./end_eps_for_{df_names[1]}_in_pair_w_{df_names[0]}_full_diamonds.csv', index=False)
+df_loss.to_csv(f'./end_eps_for_{df_names[1]}_in_pair_w_{df_names[0]}_full.csv', index=False)
 # computation of intersection values with all points (outliers included) without sampling of instances
 logging.info(f'BEFORE DBSCAN, {df_names[1]} is fixed')
...
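For context: the hunks above only rename the output CSVs, but the surrounding loop evidently records, for each feature combination, the DBSCAN eps at which the search ended (end_eps) and the number of points retained (end_sample). A minimal sketch of what such a search might look like; the find_end_eps helper and the eps grid are illustrative assumptions, not this repo's API:

import numpy as np
from sklearn.cluster import DBSCAN

def find_end_eps(X, eps_grid=np.arange(0.1, 2.1, 0.1), min_samples=5):
    """Hypothetical helper: grow eps until DBSCAN marks no point as noise (-1);
    return that eps and the number of points kept."""
    end_eps, end_sample = eps_grid[-1], 0
    for eps in eps_grid:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
        end_eps, end_sample = eps, int((labels != -1).sum())
        if end_sample == len(X):  # no outliers left, stop here
            break
    return end_eps, end_sample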
@@ -62,7 +62,7 @@ for combo in combos:
     df_loss.loc[idx, 'end_eps'] = end_eps
     df_loss.loc[idx, 'end_sample'] = end_sample
     idx += 1
-df_loss.to_csv(f'./end_eps_for_{df_names[0]}_in_pair_w_{df_names[1]}_full_diamonds.csv', index=False)
+df_loss.to_csv(f'./end_eps_for_{df_names[0]}_in_pair_w_{df_names[1]}_full.csv', index=False)
 # second dataset
@@ -77,7 +77,7 @@ for combo in combos:
     df_loss.loc[idx, 'end_eps'] = end_eps
     df_loss.loc[idx, 'end_sample'] = end_sample
     idx += 1
-df_loss.to_csv(f'./end_eps_for_{df_names[1]}_in_pair_w_{df_names[0]}_full_diamonds.csv', index=False)
+df_loss.to_csv(f'./end_eps_for_{df_names[1]}_in_pair_w_{df_names[0]}_full.csv', index=False)
 ### the second dataframe is considered at its full size, the size of the first dataset is altered
 # computation of intersection values with all points (outliers included) and sampling of instances
@@ -87,7 +87,7 @@ for combo in combos:
     df_curr = compute_intersections(combo = combo, df_1 = df_1, df_2 = df_2, df_names = df_names,
                                     bools_1 = [], bools_2 = [], first_fixed = False, size = size)
     df_1_in_2 = pd.concat([df_1_in_2, df_curr])
-df_1_in_2.to_csv(f"./sampled_{df_names[0]}_and_full_{df_names[1]}_all_points_diamonds.csv", index=False)
+df_1_in_2.to_csv(f"./sampled_{df_names[0]}_and_full_{df_names[1]}_all_points.csv", index=False)
 # computation of intersection values after outlier removal and sampling of instances
 logging.info(f'AFTER DBSCAN, {df_names[1]} is fixed')
@@ -96,7 +96,7 @@ for combo, bool_1, bool_2 in zip(combos, result_bools_1, result_bools_2):
     df_curr = compute_intersections(combo = combo, df_1 = df_1, df_2 = df_2, df_names = df_names,
                                     bools_1 = bool_1, bools_2 = bool_2, first_fixed = False, size = size)
     df_1_in_2 = pd.concat([df_1_in_2, df_curr])
-df_1_in_2.to_csv(f"./full_{df_names[0]}_and_full_{df_names[1]}_wo_outliers_diamonds.csv", index=False)
+df_1_in_2.to_csv(f"./full_{df_names[0]}_and_full_{df_names[1]}_wo_outliers.csv", index=False)
 ### symmetric case: the first dataframe is considered at its full size, the size of the second dataset is altered
 # computation of intersection values with all points (outliers included) and sampling of instances
@@ -106,7 +106,7 @@ for combo in combos:
     df_curr = compute_intersections(combo = combo, df_1 = df_1, df_2 = df_2, df_names = df_names,
                                     bools_1 = [], bools_2 = [], first_fixed = True, size = size)
     df_1_in_2 = pd.concat([df_1_in_2, df_curr])
-df_1_in_2.to_csv(f"./sampled_{df_names[1]}_and_full_{df_names[0]}_all_points_diamonds.csv", index=False)
+df_1_in_2.to_csv(f"./sampled_{df_names[1]}_and_full_{df_names[0]}_all_points.csv", index=False)
 # computation of intersection values after outlier removal and sampling of instances
 logging.info(f'AFTER DBSCAN, {df_names[0]} is fixed')
@@ -115,4 +115,4 @@ for combo, bool_1, bool_2 in zip(combos, result_bools_1, result_bools_2):
     df_curr = compute_intersections(combo = combo, df_1 = df_1, df_2 = df_2, df_names = df_names,
                                     bools_1 = bool_1, bools_2 = bool_2, first_fixed = True, size = size)
     df_1_in_2 = pd.concat([df_1_in_2, df_curr])
-df_1_in_2.to_csv(f"./sampled_{df_names[1]}_and_full_{df_names[0]}_wo_outliers_diamonds.csv", index=False)
+df_1_in_2.to_csv(f"./sampled_{df_names[1]}_and_full_{df_names[0]}_wo_outliers.csv", index=False)
@@ -19,7 +19,7 @@ path_to_ch_outputs = f'{path_to_tables}/full_*' #FIXME: fix to the path where th
 # intersection values the combination of features with the smallest intersection values are found. Condition for the search is
 # the lower fence, values that are smaller are considered as those that may include important features (i.e. those that
 # discriminate between the sets). If the lower fence is negative, no such combinations are found
-df = pd.read_csv(f'{path_to_table}/full_{df_pairs[0]}_and_full_{df_pairs[1]}_wo_outliers_diamonds.csv') #FIXME: fix to the path where the table is stored
+df = pd.read_csv(f'{path_to_table}/full_{df_pairs[0]}_and_full_{df_pairs[1]}_wo_outliers.csv') #FIXME: fix to the path where the table is stored
 df_mean = df.groupby(['Name', 'Parameters']).mean()
 df_mean = df_mean.reset_index()
@@ -37,7 +37,7 @@ enrich_df = pd.concat([enrich_1, enrich_2]).reset_index(drop = True)
 if not enrich_df.empty:
     # if the table is not empty it contains the list of important features, their count in 'bad' combinations (those with low
     # intersection values) and 'good' combinations (those that are higher than the lower fence), odds and p-values
-    enrich_df.to_csv(f'./important_parameters_{df_pairs[0]}_and_{df_pairs[1]}_diamonds.csv', index=False)
+    enrich_df.to_csv(f'./important_parameters_{df_pairs[0]}_and_{df_pairs[1]}.csv', index=False)
 else:
     logging.info('Bad parameters not found')
...
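The comments in this file describe a lower-fence cutoff on intersection values and an enrichment table with counts, odds and p-values. A hedged sketch of both steps: the Tukey fence (Q1 - 1.5*IQR) and Fisher's exact test are the standard choices that match this description, but the exact formula and test used in the repo are assumptions. Since intersection values are non-negative, a negative fence flags nothing, consistent with the comment above.

import numpy as np
from scipy.stats import fisher_exact

inters = np.array([0.80, 0.82, 0.79, 0.85, 0.30, 0.81])  # toy intersection values
q1, q3 = np.percentile(inters, [25, 75])
lower_fence = q1 - 1.5 * (q3 - q1)    # a negative fence flags no combinations
bad_combos = inters < lower_fence     # 'bad' combinations fall below the fence

# per feature, a 2x2 table of counts in 'bad' vs 'good' combinations (toy numbers)
table = [[3, 1],    # feature present: in bad, in good
         [2, 10]]   # feature absent:  in bad, in good
odds, p_value = fisher_exact(table)
print(lower_fence, odds, p_value)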
@@ -25,4 +25,4 @@ df_2['df_name'] = 'second_df'
 # df_name: name of a dataframe (in target variable) to be set at 1 in the classification problem
 # corr_coef: float, coefficient of acceptable correlation between features. If more, then one of the correlated features will be dropped
 result = classify(df_1 = df_1, df_2 = df_2, target = 'df_name', df_name = 'first_df', corr_coef = 0.9)
-result.to_csv('./Classification_diamonds.csv', index=False)
+result.to_csv('./Classification.csv', index=False)
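The corr_coef docstring above says that when two features correlate above the threshold, one of them is dropped before classification. A minimal sketch of one common way to implement that rule; the drop_correlated helper and the upper-triangle drop rule are assumptions, not necessarily what classify does internally:

import numpy as np
import pandas as pd

def drop_correlated(df, corr_coef=0.9):
    """Hypothetical helper: drop one feature from every pair whose absolute
    correlation exceeds corr_coef, using the upper triangle so each pair is seen once."""
    corr = df.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [c for c in upper.columns if (upper[c] > corr_coef).any()]
    return df.drop(columns=to_drop)

df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 4, 6, 8], 'c': [4, 1, 3, 2]})
print(drop_correlated(df, corr_coef=0.9).columns.tolist())  # ['a', 'c'] -- 'b' duplicates 'a'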