Commit 8cbc6be0 authored by Sparsh Jauhari's avatar Sparsh Jauhari 💬
Browse files

Added debiased w2v (on raw and bio) and debiased self-trained w2v (only on raw)

parent a21bfe23
......@@ -20,7 +20,7 @@ SEED = 414325
WORD2VEC_PATH = "word_embeddings/GoogleNews-vectors-negative300.bin"
DEBIASED_WORD2VEC_PATH = "word_embeddings/GoogleNews-vectors-negative300-hard-debiased.bin"
DEBIASED_SELF_WORD2VEC_PATH = "word_embeddings/Self_trained_word2vec_debiased.bin"
MASKED = { True:'bio',
False:'raw'
......@@ -92,8 +92,11 @@ DATASET_NAMES = {
('model','svm','self_w2v','medical','balanced',0.2,'bio') : 'models/modl_sv_tv_med_bal_0.2_b',
('model','svm','self_w2v','medical','balanced',0.2,'raw') : 'models/modl_sv_tv_med_bal_0.2_r',
('model','svm','elmo','medical','balanced',0.2,'bio') : 'models/modl_sv_ev_med_bal_0.2_b',
('model','svm','elmo','medical','balanced',0.2,'raw') : 'models/modl_sv_ev_med_bal_0.2_r'
('model','svm','elmo','medical','balanced',0.2,'raw') : 'models/modl_sv_ev_med_bal_0.2_r',
('model','svm','d_self_w2v','medical','balanced',0.2,'raw') : 'models/modl_sv_dtv_med_bal_0.2_r',
('model','svm','d_self_w2v','medical','balanced',0.2,'bio') : 'models/modl_sv_dtv_med_bal_0.2_b',
('model','svm','d_w2v','medical','balanced',0.2,'bio') : 'models/modl_sv_dwv_med_bal_0.2_b',
('model','svm','d_w2v','medical','balanced',0.2,'raw') : 'models/modl_sv_dwv_med_bal_0.2_r',
}
PREDICTED_DATASET = {
......@@ -125,6 +128,10 @@ PREDICTED_DATASET = {
('svm','w2v','medical','random',0.2,'bio') : 'predicted_datasets/pred_sv_wv_med_ran_0.2_b',
('svm','self_w2v','medical','random',0.2,'raw') : 'predicted_datasets/pred_sv_tv_med_ran_0.2_r',
('svm','self_w2v','medical','random',0.2,'bio') : 'predicted_datasets/pred_sv_tv_med_ran_0.2_b',
('svm','d_w2v','medical','random',0.2,'raw') : 'predicted_datasets/pred_sv_dwv_med_ran_0.2_r',
('svm','d_w2v','medical','random',0.2,'bio') : 'predicted_datasets/pred_sv_dwv_med_ran_0.2_b',
('svm','d_self_w2v','medical','random',0.2,'raw') : 'predicted_datasets/pred_sv_dtv_med_ran_0.2_r',
('svm','d_self_w2v','medical','random',0.2,'bio') : 'predicted_datasets/pred_sv_dtv_med_ran_0.2_b',
('svm','elmo','medical','random',0.2,'raw') : 'predicted_datasets/pred_sv_ev_med_ran_0.2_r',
('svm','elmo','medical','random',0.2,'bio') : 'predicted_datasets/pred_sv_ev_med_ran_0.2_b',
......@@ -132,8 +139,12 @@ PREDICTED_DATASET = {
('svm','cv','medical','balanced',0.2,'bio') : 'predicted_datasets/pred_sv_cv_med_bal_0.2_b',
('svm','w2v','medical','balanced',0.2,'raw') : 'predicted_datasets/pred_sv_wv_med_bal_0.2_r',
('svm','w2v','medical','balanced',0.2,'bio') : 'predicted_datasets/pred_sv_wv_med_bal_0.2_b',
('svm','d_w2v','medical','balanced',0.2,'raw') : 'predicted_datasets/pred_sv_dwv_med_bal_0.2_r',
('svm','d_w2v','medical','balanced',0.2,'bio') : 'predicted_datasets/pred_sv_dwv_med_bal_0.2_b',
('svm','self_w2v','medical','balanced',0.2,'raw') : 'predicted_datasets/pred_sv_tv_med_bal_0.2_r',
('svm','self_w2v','medical','balanced',0.2,'bio') : 'predicted_datasets/pred_sv_tv_med_bal_0.2_b',
('svm','d_self_w2v','medical','balanced',0.2,'raw') : 'predicted_datasets/pred_sv_dtv_med_bal_0.2_r',
('svm','d_self_w2v','medical','balanced',0.2,'bio') : 'predicted_datasets/pred_sv_dtv_med_bal_0.2_b',
('svm','elmo','medical','balanced',0.2,'raw') : 'predicted_datasets/pred_sv_ev_med_bal_0.2_r',
('svm','elmo','medical','balanced',0.2,'bio') : 'predicted_datasets/pred_sv_ev_med_bal_0.2_b'
......
......@@ -4,7 +4,7 @@ import tensorflow as tf
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from config import WORD2VEC_PATH, DATASET_NAMES, MASKED
from config import WORD2VEC_PATH, DEBIASED_WORD2VEC_PATH, DEBIASED_SELF_WORD2VEC_PATH, DATASET_NAMES, MASKED
# from gensim.models import Word2Vec
from gensim.models import KeyedVectors, Word2Vec
from joblib import dump, load
......@@ -56,6 +56,12 @@ def embedding_transform(x_list, embedding, class_group, sampling, test_size, mas
elif embedding == 'w2v':
X = pretrained_word2vec_transform(x_list,masking)
elif embedding == 'd_w2v':
X = pretrained_debiased_word2vec_transform(x_list,masking)
elif embedding == 'd_self_w2v':
X = selftrained_debiased_word2vec_transform(x_list,masking)
elif embedding == 'elmo':
X = elmo_transform(x_list,masking)
......@@ -94,7 +100,7 @@ def count_vectorize_fit_transform(x_list,masking):
vectorizer_binary = CountVectorizer(lowercase=True, preprocessor=None, binary=True, stop_words=None, tokenizer=preProcessAndTokenize)
X = vectorizer_binary.fit_transform(x_list)
print(" count vectorized with dimension : ",len(vectorizer_binary.get_feature_names()))
print("count vectorized with dimension : ",len(vectorizer_binary.get_feature_names()))
return vectorizer_binary, X
......@@ -112,6 +118,17 @@ def word2vec_transform(model, x_list,masking):
X = np.array([np.mean([model[word] for word in filter(lambda x: x in model,preProcessAndTokenize(sent))],axis=0) for sent in x_list])
return X
def pretrained_debiased_word2vec_transform(x_list,masking):
    """Embed sentences using the pre-trained hard-debiased word2vec vectors.

    Loads the binary keyed-vector file at DEBIASED_WORD2VEC_PATH and delegates
    the per-sentence averaging to word2vec_transform.
    """
    debiased_vectors = KeyedVectors.load_word2vec_format(DEBIASED_WORD2VEC_PATH, binary=True)
    return word2vec_transform(debiased_vectors, x_list, masking)
def selftrained_debiased_word2vec_transform(x_list,masking):
    """Embed sentences using the self-trained debiased word2vec vectors.

    Loads the binary keyed-vector file at DEBIASED_SELF_WORD2VEC_PATH and
    delegates the per-sentence averaging to word2vec_transform.
    """
    debiased_vectors = KeyedVectors.load_word2vec_format(DEBIASED_SELF_WORD2VEC_PATH, binary=True)
    return word2vec_transform(debiased_vectors, x_list, masking)
def pretrained_word2vec_transform(x_list,masking):
# pre trained
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment