Commit 7a498249 authored by Nishtha Jain's avatar Nishtha Jain
Browse files

plots and graphs added

parent df560f8e
......@@ -25,7 +25,7 @@ MASKED = { True:'bio',
}
DATASET_NAMES = {
# Naming convention : [dset|embd|modl]_[sv|rf|nn]_[cv|wv]_[tri|med]_[ran|bal]_[test_spit]_[b|r]
# Naming convention : [dset|embd|modl]_[sv|rf|nn]_[cv|wv|tv]_[tri|med]_[ran|bal]_[test_split]_[b|r]
# trial domain
('datasets','trial') : 'datasets/dset___tri___',
......@@ -49,6 +49,7 @@ DATASET_NAMES = {
('model','svm','self_w2v','trial','balanced',0.2,'bio') : 'models/modl_sv_tv_tri_bal_0.2_b',
# medical domain
('datasets','medical') : 'datasets/dset___med___',
......@@ -73,4 +74,14 @@ DATASET_NAMES = {
('model','svm','w2v','medical','balanced',0.2,'bio') : 'models/modl_sv_wv_med_bal_0.2_b',
('model','svm','w2v','medical','balanced',0.2,'raw') : 'models/modl_sv_wv_med_bal_0.2_r'
}
# Path stems for saved evaluation plots; callers append the file extension
# (e.g. '.png') themselves.
# Key   : (metric, model, embedding, class_group, sampling, test_size, masking)
# Value : plots_and_graphs/plot_[tgp|aod]_[sv|rf|nn]_[cv|wv|tv]_[tri|med]_[ran|bal]_[test_split]_[b|r]
# Both male and female panels are written to one figure, so the stems carry
# no per-gender segment.
PLOT_NAMES = {
    (
        'tgp', 'svm', 'self_w2v', 'trial', 'balanced', 0.2, 'bio',
    ): 'plots_and_graphs/plot_tgp_sv_tv_tri_bal_0.2_b',
    (
        'aod', 'svm', 'self_w2v', 'trial', 'balanced', 0.2, 'bio',
    ): 'plots_and_graphs/plot_aod_sv_tv_tri_bal_0.2_b',
}
\ No newline at end of file
from config import CLASS_GROUP, MASKED, TITLE
from config import CLASS_GROUP, MASKED, TITLE, PLOT_NAMES
from model import model_prediction
from preprocessing import embedding_transform
import numpy as np
......@@ -28,10 +28,17 @@ def tpr_fpr(test, model, embedding, class_group, sampling, test_size, masking):
'fp':fp, 'count':len(pred),
'fpr':float(fp)/len(pred)}
scores = { 'count_males' = [],
'count_females' = [],
'tpr_males' = [],
'tpr_females' = []
scores = { 'count_males' :[],
'count_females' : [],
'tpr_males' : [],
'tpr_females' : [],
'tp_males' : [],
'tp_females' : [],
'fpr_males' : [],
'fpr_females' : [],
'fp_males' : [],
'fp_females' : []
}
for c in res:
......@@ -59,26 +66,26 @@ def tpr_gender_gap(scores ,test, model, embedding, class_group, sampling, test_s
scores = tpr_fpr(test, model, embedding, class_group, sampling, test_size, masking)
x_males = [scores['count_males'][i]/(scores['count_males'][i]+scores['count_females'][i]) for i in range(7)]
y_males = [scores['tpr_males'][i]-scores['tpr_females'][i] for i in range(7)]
plt.scatter(x_males, y_males)
plt.xlabel("% Male")
plt.ylabel("TPR Gender Gap Male")
x_males = [scores['count_males'][i]/(scores['count_males'][i]+scores['count_females'][i]) for i in range(len(CLASS_GROUP[class_group]))]
y_males = [scores['tpr_males'][i]-scores['tpr_females'][i] for i in range(len(CLASS_GROUP[class_group]))]
x_females = [scores['count_females'][i]/(scores['count_males'][i]+scores['count_females'][i]) for i in range(len(CLASS_GROUP[class_group]))]
y_females = [scores['tpr_females'][i]-scores['tpr_males'][i] for i in range(len(CLASS_GROUP[class_group]))]
fig, (axs_males,axs_females) = plt.subplots(1, 2, figsize=(20, 8))
axs_males.scatter(x_males, y_males)
axs_males.set_xlabel("% Male")
axs_males.set_ylabel("TPR Gender Gap Male")
for i, txt in enumerate(CLASS_GROUP[class_group]):
plt.annotate(txt, (x_males[i], y_males[i]))
plt.show()
x_females = [scores['count_females'][i]/(scores['count_males'][i]+scores['count_females'][i]) for i in range(7)]
y_females = [scores['tpr_females'][i]-scores['tpr_males'][i] for i in range(7)]
plt.scatter(x_females, y_females)
plt.xlabel("% Female")
plt.ylabel("TPR Gender Gap Female")
axs_males.annotate(txt, (x_males[i], y_males[i]))
axs_females.scatter(x_females, y_females)
axs_females.set_xlabel("% Female")
axs_females.set_ylabel("TPR Gender Gap Female")
for i, txt in enumerate(CLASS_GROUP[class_group]):
plt.annotate(txt, (x_females[i], y_females[i]))
axs_females.annotate(txt, (x_females[i], y_females[i]))
plt.savefig(PLOT_NAMES['tgp',model,embedding,class_group,sampling,test_size,MASKED[masking]] + '.png')
plt.show()
return (scores,x_males,y_males,x_females,y_females)
......@@ -90,26 +97,26 @@ def average_odds_difference(scores, test, model, embedding, class_group, samplin
if not scores:
scores = tpr_fpr(test, model, embedding, class_group, sampling, test_size, masking)
x_males = [scores['count_males'][i]/(scores['count_males'][i]+scores['count_females'][i]) for i in range(7)]
y_males = [(scores['fpr_males'][i]-scores['fpr_females'][i] + scores['tpr_males'][i]-scores['tpr_females'][i])/2 for i in range(7)]
x_males = [scores['count_males'][i]/(scores['count_males'][i]+scores['count_females'][i]) for i in range(len(CLASS_GROUP[class_group]))]
y_males = [(scores['fpr_males'][i]-scores['fpr_females'][i] + scores['tpr_males'][i]-scores['tpr_females'][i])/2 for i in range(len(CLASS_GROUP[class_group]))]
x_females = [scores['count_females'][i]/(scores['count_males'][i]+scores['count_females'][i]) for i in range(len(CLASS_GROUP[class_group]))]
y_females = [(scores['fpr_females'][i]-scores['fpr_males'][i] + scores['tpr_females'][i]-scores['tpr_males'][i])/2 for i in range(len(CLASS_GROUP[class_group]))]
plt.scatter(x_males, y_males)
plt.xlabel("% Male")
plt.ylabel("Avg odds difference Male")
fig, (axs_males,axs_females) = plt.subplots(1, 2, figsize=(20, 8))
axs_males.scatter(x_males, y_males)
axs_males.set_xlabel("% Male")
axs_males.set_ylabel("Avg odds difference Male")
for i, txt in enumerate(CLASS_GROUP[class_group]):
plt.annotate(txt, (x_males[i], y_males[i]))
plt.show()
x_females = [scores['count_females'][i]/(scores['count_males'][i]+scores['count_females'][i]) for i in range(7)]
y_females = [(scores['fpr_females'][i]-scores['fpr_males'][i] + scores['tpr_females'][i]-scores['tpr_males'][i])/2 for i in range(7)]
plt.scatter(x_females, y_females)
plt.xlabel("% Female")
plt.ylabel("Avg odds difference Male")
axs_males.annotate(txt, (x_males[i], y_males[i]))
axs_females.scatter(x_females, y_females)
axs_females.set_xlabel("% Female")
axs_females.set_ylabel("Avg odds difference Male")
for i, txt in enumerate(CLASS_GROUP[class_group]):
plt.annotate(txt, (x_females[i], y_females[i]))
axs_females.annotate(txt, (x_females[i], y_females[i]))
plt.savefig(PLOT_NAMES['aod',model,embedding,class_group,sampling,test_size,MASKED[masking]] + '.png')
plt.show()
return (scores,x_males,y_males,x_females,y_females)
......@@ -2,7 +2,7 @@ from config import CLASS_GROUP, TITLE, MASKED, SEED, DATASET_NAMES
from preprocessing import embedding_fit_transform, embedding_transform
from sampling import data_selection, load_data
from model import model_training, model_prediction
from evaluation import tpr_gender_gap
from evaluation import tpr_gender_gap, average_odds_difference
import pandas as pd
......@@ -34,7 +34,7 @@ def main(load_data_from_saved, embedding_train, model_train, predict, evaluate,
if evaluate:
scores,x_males,y_males,x_females,y_females = tpr_gender_gap(None,test_set, model, embedding, class_group, sampling, test_size, masking)
scores,x_males,y_males,x_females,y_females = average_odds_difference(scores,test_set, model, embedding, class_group, sampling, test_size, masking)
print(scores)
'''
......@@ -46,11 +46,11 @@ if __name__ == "__main__":
import time
start_time = time.time()
main(load_data_from_saved = False,
embedding_train = True,
model_train = True,
main(load_data_from_saved = True,
embedding_train = False,
model_train = False,
predict = True,
evaluate = False,
evaluate = True,
class_group = 'trial',
sampling = 'balanced',
embedding = 'self_w2v',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment