Commit 23963d9e authored by Nishtha Jain's avatar Nishtha Jain
Browse files

code refinement

parent 527b0adc
This diff is collapsed.
......@@ -120,3 +120,34 @@ def average_odds_difference(scores, test, model, embedding, class_group, samplin
plt.show()
return (scores,x_males,y_males,x_females,y_females)
def average_odds_error(scores, test, model, embedding, class_group, sampling, test_size, masking):
    """Compute and plot the average-odds error per class, split by gender.

    The average odds error for a class is the mean of the absolute FPR gap
    and the absolute TPR gap between the male and female groups:
        (|FPR_m - FPR_f| + |TPR_m - TPR_f|) / 2
    Note this metric is symmetric, so the male and female y-values coincide;
    both are kept so the return shape matches the sibling evaluation functions.

    Parameters
    ----------
    scores : dict or falsy
        Precomputed tpr/fpr/count scores; recomputed via tpr_fpr() when falsy.
    test, model, embedding, class_group, sampling, test_size, masking :
        Forwarded to tpr_fpr() and used to build the plot file name.

    Returns
    -------
    tuple : (scores, x_males, y_males, x_females, y_females)
    """
    print("processing evaluation.average_odds_error ...")
    if not scores:
        scores = tpr_fpr(test, model, embedding, class_group, sampling, test_size, masking)
    n_classes = len(CLASS_GROUP[class_group])
    # Hoist the shared per-class totals instead of recomputing them per list.
    totals = [scores['count_males'][i] + scores['count_females'][i] for i in range(n_classes)]
    x_males = [scores['count_males'][i] / totals[i] for i in range(n_classes)]
    x_females = [scores['count_females'][i] / totals[i] for i in range(n_classes)]
    # BUGFIX: the original computed abs(fpr_diff + abs(tpr_diff)) / 2 — the
    # outer abs() was misplaced. The metric is the mean of the two absolute gaps.
    y_males = [(abs(scores['fpr_males'][i] - scores['fpr_females'][i])
                + abs(scores['tpr_males'][i] - scores['tpr_females'][i])) / 2
               for i in range(n_classes)]
    y_females = [(abs(scores['fpr_females'][i] - scores['fpr_males'][i])
                  + abs(scores['tpr_females'][i] - scores['tpr_males'][i])) / 2
                 for i in range(n_classes)]
    fig, (axs_males, axs_females) = plt.subplots(1, 2, figsize=(20, 8))
    axs_males.scatter(x_males, y_males)
    axs_males.set_xlabel("% Male")
    axs_males.set_ylabel("Avg odds error Male")
    for i, txt in enumerate(CLASS_GROUP[class_group]):
        axs_males.annotate(txt, (x_males[i], y_males[i]))
    axs_females.scatter(x_females, y_females)
    axs_females.set_xlabel("% Female")
    # BUGFIX: label said "Avg odds error Male" on the female subplot (copy-paste).
    axs_females.set_ylabel("Avg odds error Female")
    for i, txt in enumerate(CLASS_GROUP[class_group]):
        axs_females.annotate(txt, (x_females[i], y_females[i]))
    plt.savefig(PLOT_NAMES['aoe', model, embedding, class_group, sampling, test_size, MASKED[masking]] + '.png')
    plt.show()
    return (scores, x_males, y_males, x_females, y_females)
......@@ -108,7 +108,6 @@ def nn_train(X_train, Y_train):
# check again everything !!!!!!
from keras import Sequential
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
......
......@@ -63,11 +63,9 @@ def embedding_transform(x_list, embedding, class_group, sampling, test_size, mas
''' Add this with every embedding
-> RegexpTokenizer
-> nltk english stop_words
-> SnowballStemmer
'''
-> SnowballStemmer'''
def preProcessAndTokenize(sentence):
tokenizer = nltk.RegexpTokenizer(r"\w+")
tokenized_words=tokenizer.tokenize(sentence.lower())
......
......@@ -2,7 +2,7 @@ from config import CLASS_GROUP, TITLE, MASKED, SEED, DATASET_NAMES, PREDICTED_DA
from preprocessing import embedding_fit_transform, embedding_transform
from sampling import data_selection, load_data
from model import model_training, model_prediction
from evaluation import tpr_gender_gap, average_odds_difference
from evaluation import tpr_gender_gap, tpr_fpr
import pandas as pd
from joblib import dump, load
......@@ -48,8 +48,9 @@ def main(load_data_from_saved, embedding_train, model_train, predict, evaluate,
# combine the prediction in evaluation to the saved prediction
if evaluate:
scores,x_males,y_males,x_females,y_females = tpr_gender_gap(None,test_set, model, embedding, class_group, sampling, test_size, masking)
scores,x_males,y_males,x_females,y_females = average_odds_difference(scores,test_set, model, embedding, class_group, sampling, test_size, masking)
scores = tpr_fpr(test_set, model, embedding, class_group, sampling, test_size, masking)
# scores,x_males,y_males,x_females,y_females = tpr_gender_gap(None,test_set, model, embedding, class_group, sampling, test_size, masking)
print("\t saving file :",EVALUATION_SCORES[model, embedding, class_group, sampling, test_size, MASKED[masking]])
dump(scores,EVALUATION_SCORES[model, embedding, class_group, sampling, test_size, MASKED[masking]] + '.joblib')
print(scores)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment