Commit dd07b896 authored by Nishtha Jain's avatar Nishtha Jain
Browse files

prediction saving added and some restructuring

parent 7a498249
Make sure these directories are present
`mkdir datasets models plots_and_graphs predicted_datasets word_embeddings`
to add pretrained word embedding models, write in terminal: to add pretrained word embedding models, write in terminal:
`cd word_embeddings` `cd word_embeddings`
...@@ -8,5 +11,15 @@ to add pretrained word embedding models, write in terminal: ...@@ -8,5 +11,15 @@ to add pretrained word embedding models, write in terminal:
to get debiased version of pretrained word2Vec embedding, use the link: <br /> https://drive.google.com/file/d/1_PvT4ZvtZjhq4HPywA8-u06epht9ccOw/view?usp=sharing to get debiased version of pretrained word2Vec embedding, use the link: <br /> https://drive.google.com/file/d/1_PvT4ZvtZjhq4HPywA8-u06epht9ccOw/view?usp=sharing
install install requirements.txt
gensim, pymongo, dnspython
config.py - constants
evaluation.py - contains evaluation metrics
model.py - contains model descriptions and training and prediction modules
preprocessing.py - contains embeddings and preprocessing tasks
sampling.py - contains data extraction and sampling tasks
train.py - contains the runnable flow of the project
run train.py
\ No newline at end of file
...@@ -76,12 +76,25 @@ DATASET_NAMES = { ...@@ -76,12 +76,25 @@ DATASET_NAMES = {
} }
# Lookup table of output path prefixes for saved prediction datasets.
# Keys are tuples of (model, embedding, class_group, sampling, test_size, 'raw'|'bio'),
# the same order in which train.py indexes this dict; the caller appends
# '.joblib' to the value when dumping the predictions.
# Only the combinations listed here are available; any other key raises KeyError on lookup.
PREDICTED_DATASET = {
# Naming convention : pred_[sv|rf|nn]_[cv|wv|tv]_[tri|med]_[ran|bal]_[test_split]_[b|r]
#   sv|rf|nn -> model, cv|wv|tv -> embedding, tri|med -> class group,
#   ran|bal -> sampling, test_split -> test-set proportion, b|r -> 'bio' or 'raw' text
('svm','w2v','trial','balanced',0.2,'raw') : 'predicted_datasets/pred_sv_wv_tri_bal_0.2_r',
('svm','w2v','trial','balanced',0.2,'bio') : 'predicted_datasets/pred_sv_wv_tri_bal_0.2_b',
('svm','self_w2v','trial','balanced',0.2,'raw') : 'predicted_datasets/pred_sv_tv_tri_bal_0.2_r',
('svm','self_w2v','trial','balanced',0.2,'bio') : 'predicted_datasets/pred_sv_tv_tri_bal_0.2_b',
('svm','w2v','medical','balanced',0.2,'raw') : 'predicted_datasets/pred_sv_wv_med_bal_0.2_r',
('svm','w2v','medical','balanced',0.2,'bio') : 'predicted_datasets/pred_sv_wv_med_bal_0.2_b',
}
PLOT_NAMES = { PLOT_NAMES = {
# Naming convention : plot_[tgp|aod]_[M|F]_[sv|rf|nn]_[cv|wv|tv]_[tri|med]_[ran|bal]_[test_split]_[b|r] # Naming convention : plot_[tgp|aod]_[M|F]_[sv|rf|nn]_[cv|wv|tv]_[tri|med]_[ran|bal]_[test_split]_[b|r]
('tgp','svm','self_w2v','trial','balanced',0.2,'bio') : 'plots_and_graphs/plot_tgp_sv_tv_tri_bal_0.2_b', ('tgp','svm','self_w2v','trial','balanced',0.2,'bio') : 'plots_and_graphs/plot_tgp_sv_tv_tri_bal_0.2_b',
('aod','svm','self_w2v','trial','balanced',0.2,'bio') : 'plots_and_graphs/plot_aod_sv_tv_tri_bal_0.2_b', ('aod','svm','self_w2v','trial','balanced',0.2,'bio') : 'plots_and_graphs/plot_aod_sv_tv_tri_bal_0.2_b'
} }
\ No newline at end of file
...@@ -12,8 +12,8 @@ def model_training(X_train, Y_train, model, embedding, class_group, sampling, te ...@@ -12,8 +12,8 @@ def model_training(X_train, Y_train, model, embedding, class_group, sampling, te
elif model == 'nn': elif model == 'nn':
trained_model = nn_train(X_train, Y_train) trained_model = nn_train(X_train, Y_train)
dump(trained_model, DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
print("\t saving file :",DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]]) print("\t saving file :",DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]])
dump(trained_model, DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
def model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking): def model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking):
......
distlib==0.3.1 distlib==0.3.1
pymongo==3.11.4 pymongo==3.11.4
virtualenv==20.4.2 virtualenv==20.4.2
gensim==4.0.1
pymongo
dnspython
\ No newline at end of file
...@@ -12,8 +12,8 @@ def get_data_from_mongo(class_group): ...@@ -12,8 +12,8 @@ def get_data_from_mongo(class_group):
data = list(collection.find({'$or':[{'title':title} for title in CLASS_GROUP[class_group]]})) data = list(collection.find({'$or':[{'title':title} for title in CLASS_GROUP[class_group]]}))
dump(data, DATASET_NAMES['datasets',class_group]+'.joblib')
print("\t saving file : ",DATASET_NAMES['datasets',class_group]) print("\t saving file : ",DATASET_NAMES['datasets',class_group])
dump(data, DATASET_NAMES['datasets',class_group]+'.joblib')
return(data) return(data)
...@@ -39,8 +39,8 @@ def data_selection(data, class_group, sampling, test_size, masking=True): ...@@ -39,8 +39,8 @@ def data_selection(data, class_group, sampling, test_size, masking=True):
train,test = train_test_split(data, test_size = test_size, random_state = SEED) train,test = train_test_split(data, test_size = test_size, random_state = SEED)
dump([train,test], DATASET_NAMES['datasets',class_group,sampling,test_size]+'.joblib')
print("\t saving file : ",DATASET_NAMES['datasets',class_group,sampling,test_size]) print("\t saving file : ",DATASET_NAMES['datasets',class_group,sampling,test_size])
dump([train,test], DATASET_NAMES['datasets',class_group,sampling,test_size]+'.joblib')
return train,test return train,test
......
from config import CLASS_GROUP, TITLE, MASKED, SEED, DATASET_NAMES from config import CLASS_GROUP, TITLE, MASKED, SEED, DATASET_NAMES, PREDICTED_DATASET
from preprocessing import embedding_fit_transform, embedding_transform from preprocessing import embedding_fit_transform, embedding_transform
from sampling import data_selection, load_data from sampling import data_selection, load_data
from model import model_training, model_prediction from model import model_training, model_prediction
from evaluation import tpr_gender_gap, average_odds_difference from evaluation import tpr_gender_gap, average_odds_difference
import pandas as pd import pandas as pd
from joblib import dump, load
def main(load_data_from_saved, embedding_train, model_train, predict, evaluate, class_group, sampling, embedding, model, test_size, masking): def main(load_data_from_saved, embedding_train, model_train, predict, evaluate, class_group, sampling, embedding, model, test_size, masking):
...@@ -28,19 +30,35 @@ def main(load_data_from_saved, embedding_train, model_train, predict, evaluate, ...@@ -28,19 +30,35 @@ def main(load_data_from_saved, embedding_train, model_train, predict, evaluate,
X_test = embedding_transform(test_set[MASKED[masking]], embedding, class_group, sampling, test_size, masking) X_test = embedding_transform(test_set[MASKED[masking]], embedding, class_group, sampling, test_size, masking)
Y_test = test_set[TITLE] Y_test = test_set[TITLE]
pred, acc = model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking) pred, acc = model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking)
predicted_dataset = pd.DataFrame({MASKED[masking]:test_set[MASKED[masking]], TITLE:test_set[TITLE], 'predicted':pred})
print("\t saving file :",PREDICTED_DATASET[model, embedding, class_group, sampling, test_size, MASKED[masking]])
dump(predicted_dataset, PREDICTED_DATASET[model, embedding, class_group, sampling, test_size, MASKED[masking]] + '.joblib')
print("\nModel accuracy:", acc) print("\nModel accuracy:", acc)
# evaluation # evaluation
# TO-DO:
# combine the prediction in evaluation to the saved prediction
if evaluate: if evaluate:
scores,x_males,y_males,x_females,y_females = tpr_gender_gap(None,test_set, model, embedding, class_group, sampling, test_size, masking) scores,x_males,y_males,x_females,y_females = tpr_gender_gap(None,test_set, model, embedding, class_group, sampling, test_size, masking)
scores,x_males,y_males,x_females,y_females = average_odds_difference(scores,test_set, model, embedding, class_group, sampling, test_size, masking) scores,x_males,y_males,x_females,y_females = average_odds_difference(scores,test_set, model, embedding, class_group, sampling, test_size, masking)
print(scores) print(scores)
''' '''
sampling -> random, balanced(upsample, downsample, weighted) load_data_from_saved -> True if saved data to be used and False if new data to be taken
embedding -> cv: count_vectorize(self-trained), w2v: word2vec_embedding(pre-trained), self_w2v:w2v(self-trained) embedding_train -> True to train new embedding and False to use the saved one
model -> svm, rf, nn model_train -> True to train new model and False to use the saved one
predict -> True to perform predictions on the test set and False otherwise
evaluate -> True to perform bias evaluations on the test set and False otherwise
class_group -> choice of domain of occupations ('trial','medical')
sampling -> choice of sampling ('random', 'balanced')
embedding -> choice of embeddings to be used ('cv': count_vectorize(self-trained), 'w2v': word2vec_embedding(pre-trained), 'self_w2v':w2v(self-trained))
model -> choice of models to be trained ('svm', 'rf', 'nn')
test_size -> proportion of data to be used for testing (float between 0 and 1, e.g. 0.2)
masking -> True for 'bio' data and False for 'raw' data
''' '''
if __name__ == "__main__": if __name__ == "__main__":
import time import time
...@@ -50,10 +68,10 @@ if __name__ == "__main__": ...@@ -50,10 +68,10 @@ if __name__ == "__main__":
embedding_train = False, embedding_train = False,
model_train = False, model_train = False,
predict = True, predict = True,
evaluate = True, evaluate = False,
class_group = 'trial', class_group = 'trial',
sampling = 'balanced', sampling = 'balanced',
embedding = 'self_w2v', embedding = ''self_w2v'',
model = 'svm', model = 'svm',
test_size = 0.2, test_size = 0.2,
masking = True) masking = True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment