Commit 456db499 authored by Nishtha Jain's avatar Nishtha Jain

svm-w2v and self_w2v added

parent 1d39aba7
@@ -25,32 +25,52 @@ MASKED = { True:'bio',
}
DATASET_NAMES = {
# Naming convention : [dset|embd|modl]_[sv|rf|nn]_[cv|wv]_[tri|med]_[ran|bal]_[test_spit]_[b|r]
# trial domain
('datasets','trial') : 'datasets/trial_dataset',
('datasets','trial','random',0.2) : 'datasets/trial_set_1',
('datasets','trial') : 'datasets/dset___tri___',
('datasets','trial','random',0.2) : 'datasets/dset___tri_ran_0.2_',
('datasets','trial','balanced',0.2) : 'datasets/dset___tri_bal_0.2_',
('embedding','cv','trial','random',0.2,'bio') : 'word_embeddings/embd__cv_tri_ran_0.2_b',
('embedding','cv','trial','random',0.2,'raw') : 'word_embeddings/embd__cv_tri_ran_0.2_r',
('embedding','w2v','trial','random',0.2,'bio') : 'word_embeddings/embd__wv_tri_ran_0.2_b',
('embedding','w2v','trial','random',0.2,'raw') : 'word_embeddings/embd__wv_tri_ran_0.2_r',
('embedding','cv','trial','balanced',0.2,'bio') : 'word_embeddings/embd__cv_tri_bal_0.2_b',
('embedding','w2v','trial','balanced',0.2,'bio') : 'word_embeddings/embd__wv_tri_bal_0.2_b',
('embedding','self_w2v','trial','balanced',0.2,'bio') : 'word_embeddings/embd__tv_tri_bal_0.2_b',
('embedding','cv','trial','random',0.2,'bio') : 'word_embeddings/trial_embd_1',
('embedding','cv','trial','random',0.2,'raw') : 'word_embeddings/trial_embd_2',
('embedding','w2v','trial','random',0.2,'bio') : 'word_embeddings/trial_embd_3',
('embedding','w2v','trial','random',0.2,'raw') : 'word_embeddings/trial_embd_4',
('model','svm','cv','trial','random',0.2,'bio') : 'models/modl_sv_cv_tri_ran_0.2_b',
('model','svm','cv','trial','random',0.2,'raw') : 'models/modl_sv_cv_tri_ran_0.2_r',
('model','svm','w2v','trial','random',0.2,'bio') : 'models/modl_sv_wv_tri_ran_0.2_b',
('model','svm','w2v','trial','random',0.2,'raw') : 'models/modl_sv_wv_tri_ran_0.2_r',
('model','svm','cv','trial','balanced',0.2,'bio') : 'models/modl_sv_cv_tri_bal_0.2_b',
('model','svm','w2v','trial','balanced',0.2,'bio') : 'models/modl_sv_wv_tri_bal_0.2_b',
('model','svm','self_w2v','trial','balanced',0.2,'bio') : 'models/modl_sv_tv_tri_bal_0.2_b',
('model','svm','cv','trial','random',0.2,'bio') : 'models/trial_model_1',
('model','svm','cv','trial','random',0.2,'raw') : 'models/trial_model_2',
('model','svm','w2v','trial','random',0.2,'bio') : 'models/trial_model_3',
('model','svm','w2v','trial','random',0.2,'raw') : 'models/trial_model_4',
# medical domain
('datasets','medical') : 'datasets/medical_dataset',
('datasets','medical','random',0.2) : 'datasets/set_1',
('embedding','cv','medical','random',0.2,'bio') : 'word_embeddings/embd_1',
('embedding','cv','medical','random',0.2,'raw') : 'word_embeddings/embd_2',
('embedding','w2v','medical','random',0.2,'bio') : 'word_embeddings/embd_3',
('embedding','w2v','medical','random',0.2,'raw') : 'word_embeddings/embd_4',
('model','svm','cv','medical','random',0.2,'bio') : 'models/model_1',
('model','svm','cv','medical','random',0.2,'raw') : 'models/model_2',
('model','svm','w2v','medical','random',0.2,'bio') : 'models/model_3',
('model','svm','w2v','medical','random',0.2,'raw') : 'models/model_4'
('datasets','medical') : 'datasets/dset___med___',
('datasets','medical','random',0.2) : 'datasets/dset___med_ran_0.2_',
('datasets','medical','balanced',0.2) : 'datasets/dset___med_bal_0.2_',
('embedding','cv','medical','random',0.2,'bio') : 'word_embeddings/embd__cv_med_ran_0.2_b',
('embedding','cv','medical','random',0.2,'raw') : 'word_embeddings/embd__cv_med_ran_0.2_r',
('embedding','w2v','medical','random',0.2,'bio') : 'word_embeddings/embd__wv_med_ran_0.2_b',
('embedding','w2v','medical','random',0.2,'raw') : 'word_embeddings/embd__wv_med_ran_0.2_r',
('embedding','cv','medical','balanced',0.2,'bio') : 'word_embeddings/embd__cv_med_bal_0.2_b',
('embedding','cv','medical','balanced',0.2,'raw') : 'word_embeddings/embd__cv_med_bal_0.2_r',
('embedding','w2v','medical','balanced',0.2,'bio') : 'word_embeddings/embd__wv_med_bal_0.2_b',
('embedding','w2v','medical','balanced',0.2,'raw') : 'word_embeddings/embd__wv_med_bal_0.2_r',
('model','svm','cv','medical','random',0.2,'bio') : 'models/modl_sv_cv_med_ran_0.2_b',
('model','svm','cv','medical','random',0.2,'raw') : 'models/modl_sv_cv_med_ran_0.2_r',
('model','svm','w2v','medical','random',0.2,'bio') : 'models/modl_sv_wv_med_ran_0.2_b',
('model','svm','w2v','medical','random',0.2,'raw') : 'models/modl_sv_wv_med_ran_0.2_r',
('model','svm','cv','medical','balanced',0.2,'bio') : 'models/modl_sv_cv_med_bal_0.2_b',
('model','svm','cv','medical','balanced',0.2,'raw') : 'models/modl_sv_cv_med_bal_0.2_r',
('model','svm','w2v','medical','balanced',0.2,'bio') : 'models/modl_sv_wv_med_bal_0.2_b',
('model','svm','w2v','medical','balanced',0.2,'raw') : 'models/modl_sv_wv_med_bal_0.2_r'
}
\ No newline at end of file
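A minimal lookup sketch for the new naming convention (values taken from the table above; note that self_w2v abbreviates to 'tv' even though the convention comment only lists cv and wv):

from config import DATASET_NAMES

# Fields: [dset|embd|modl]_[sv|rf|nn]_[cv|wv]_[tri|med]_[ran|bal]_[test_split]_[b|r].
# Unused fields stay empty, so separators double up (e.g. 'dset___tri___').
path = DATASET_NAMES[('model', 'svm', 'w2v', 'trial', 'balanced', 0.2, 'bio')]
print(path)  # models/modl_sv_wv_tri_bal_0.2_b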
from sklearn.svm import SVC
from config import DATASET_NAMES,MASKED
from config import DATASET_NAMES, MASKED, SEED
from joblib import dump, load
import numpy as np
def model_training(X_train, Y_train, model, embedding, class_group, sampling, test_size, masking):
    print("processing model.model_training ...")
    if model == 'svm':
        trained_model = svm_train(X_train, Y_train)
        trained_model = svm_train(X_train, Y_train, sampling)
    elif model == 'rf':
        trained_model = rf_train(X_train, Y_train)
    elif model == 'nn':
        trained_model = nn_train(X_train, Y_train)
    dump(trained_model, DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
    print("\t saving file :",DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]])

def model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking):
@@ -34,10 +35,16 @@ def load_model(model, embedding, class_group, sampling, test_size, masking):
def svm_train(X_train, Y_train):
    classifier = SVC(C=1, kernel = 'linear', gamma = 'auto')

def svm_train(X_train, Y_train, sampling):
    if sampling == 'balanced':
        class_weight = 'balanced'
    else:
        # 'random' (and any other mode) trains an unweighted SVM; an explicit
        # else avoids class_weight being undefined for unexpected sampling values
        class_weight = None
    classifier = SVC(C=1, kernel='linear', gamma='auto', class_weight=class_weight, random_state=SEED)
    print("shape of X_train",X_train.shape)
    print("first instance of X_train", X_train[0])
    print("first instance of X_train", type(X_train[0]),X_train[0])
    classifier.fit(X_train, Y_train)
    return classifier
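For context on the class_weight='balanced' option passed to SVC above: scikit-learn weights each class by n_samples / (n_classes * bincount(y)), so rarer classes count more in the hinge loss. A self-contained sketch of that heuristic on toy labels (not project data):

import numpy as np

# sklearn's 'balanced' heuristic: weight_c = n_samples / (n_classes * count_c)
y = np.array([0, 0, 0, 0, 1])                        # 4:1 imbalanced toy labels
classes, counts = np.unique(y, return_counts=True)
weights = len(y) / (len(classes) * counts)
print({int(c): float(w) for c, w in zip(classes, weights)})   # {0: 0.625, 1: 2.5}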
@@ -5,7 +5,7 @@ from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from config import WORD2VEC_PATH, DATASET_NAMES, MASKED
# from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import KeyedVectors, Word2Vec
from joblib import dump, load
import numpy as np
@@ -19,9 +19,13 @@ def embedding_fit_transform(x_list, embedding, class_group, sampling, test_size,
    if embedding == 'cv':
        trained_embedding, X = count_vectorize_fit_transform(x_list)
    elif embedding == 'w2v':
        trained_embedding, X = word2vec_fit_transform(x_list)
        trained_embedding, X = pretrained_word2vec_fit_transform(x_list)
    elif embedding == 'self_w2v':
        trained_embedding, X = selftrained_word2vec_fit_transform(x_list)
    dump(trained_embedding,DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
    print("\t saving file :",DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]])
    return(X)
@@ -32,7 +36,7 @@ def embedding_transform(x_list, embedding, class_group, sampling, test_size, mas
    if embedding == 'cv':
        X = count_vectorize_transform(trained_embedding,x_list)
    elif embedding == 'w2v':
    elif embedding == 'w2v' or embedding == 'self_w2v':
        X = word2vec_transform(trained_embedding,x_list)
    return(X)
@@ -58,7 +62,7 @@ def preProcessAndTokenize(sentence):
def count_vectorize_fit_transform(x_list):
    vectorizer_binary = CountVectorizer(lowercase=True, preprocessor=None, binary=True, stop_words=None, tokenizer=preProcessAndTokenize)
    X = vectorizer_binary.fit_transform(x_list)
    # print("count vectorized with dimension : ",len(vectorizer_binary.get_feature_names()))
    print(" count vectorized with dimension : ",len(vectorizer_binary.get_feature_names()))
    return vectorizer_binary, X
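A toy illustration of the binary=True flag used above: each token contributes presence/absence rather than a count (hypothetical docs, and sklearn's default tokenizer instead of preProcessAndTokenize). Note that get_feature_names(), used above, was removed in scikit-learn 1.2 in favour of get_feature_names_out().

from sklearn.feature_extraction.text import CountVectorizer

docs = ["nurse nurse physician", "physician"]
cv = CountVectorizer(binary=True)       # presence/absence, not term counts
X = cv.fit_transform(docs)
print(cv.get_feature_names_out())       # ['nurse' 'physician']
print(X.toarray())                      # [[1 1]
                                        #  [0 1]]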
@@ -67,20 +71,56 @@ def count_vectorize_transform(vectorizer_binary,x_list):
    return X
def word2vec_fit_transform(x_list):
    ## To-Do

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        # Python 3: next(iter(...)) replaces the Python 2 idiom itervalues().next()
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

def word2vec_transform(model, x_list):
    X = np.array([np.mean([model[word] for word in filter(lambda x: x in model,preProcessAndTokenize(sent))],axis=0) for sent in x_list])
    return X
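The rewritten word2vec_transform mean-pools the in-vocabulary word vectors of a sentence into one fixed-length vector. A toy sketch of the pooling, with a plain dict standing in for the gensim model; unlike MeanEmbeddingVectorizer above, this version has no zero-vector fallback, so np.mean would fail on a sentence with no in-vocabulary tokens:

import numpy as np

toy = {'data': np.array([1., 0., 0.]), 'science': np.array([0., 1., 0.])}
tokens = ['data', 'science', 'unknown']             # 'unknown' is filtered out
sent_vec = np.mean([toy[w] for w in tokens if w in toy], axis=0)
print(sent_vec)                                     # [0.5 0.5 0. ]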
def pretrained_word2vec_fit_transform(x_list):
    # pre trained
    # check again
    # model = Word2Vec.load(WORD2VEC_PATH)
    model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary = True)
    X = np.array([[model[word] for word in filter(lambda x: x in model,preProcessAndTokenize(sent))] for sent in x_list])
    X = word2vec_transform(model, x_list)
    return model , X

def selftrained_word2vec_fit(corpus):
    tokenised_corpus = [preProcessAndTokenize(sent) for sent in corpus]
    model = Word2Vec(tokenised_corpus, vector_size = 100).wv
    return model

def selftrained_word2vec_fit_transform(x_list):
    ## To-Do
    # change corpus to all possible texts
    corpus = x_list
    model = selftrained_word2vec_fit(corpus)
    X = word2vec_transform(model, x_list)
    return model , X
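A hedged usage sketch of the gensim 4.x call that selftrained_word2vec_fit wraps; the toy corpus and min_count=1 are illustration-only (the tiny corpus would otherwise be emptied by gensim's default min_count=5):

from gensim.models import Word2Vec

corpus = [['the', 'nurse', 'treated', 'patients'],
          ['the', 'physician', 'treated', 'patients']]
wv = Word2Vec(corpus, vector_size=100, min_count=1).wv   # KeyedVectors, as returned above
print(wv['nurse'].shape)                                 # (100,)
print('nurse' in wv)                                     # membership test used by word2vec_transform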
def word2vec_transform(model, x_list):
    X = np.array([[model[word] for word in filter(lambda x: x in model,preProcessAndTokenize(sent))] for sent in x_list])
    return X
@@ -13,6 +13,7 @@ def get_data_from_mongo(class_group):
    data = list(collection.find({'$or':[{'title':title} for title in CLASS_GROUP[class_group]]}))
    dump(data, DATASET_NAMES['datasets',class_group]+'.joblib')
    print("\t saving file : ",DATASET_NAMES['datasets',class_group])
    return(data)
@@ -26,22 +27,21 @@ def load_data(class_group, from_saved=True):
    return data
'''
sampling -> random, upsample, downsample, weighted
sampling -> random, balanced (upsample, downsample, weighted)
'''
def data_selection(data, class_group, sampling, test_size, masking=True):
    print('processing sampling.data_selection ...')
    if sampling == 'random':
    if sampling == 'random' :
        train,test = train_test_split(data, test_size = test_size, random_state = SEED)
    # same for now
    elif sampling == 'balanced':
        train,test = train_test_split(data, test_size = test_size, random_state = SEED)
    elif sampling == 'downsample':
        # TO-DO:
        train,test = None,None
    else:
        # TO-DO:
        train,test = None,None
    dump([train,test], DATASET_NAMES['datasets',class_group,sampling,test_size]+'.joblib')
    print("\t saving file : ",DATASET_NAMES['datasets',class_group,sampling,test_size])
    return train,test
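The 'balanced' branch above currently duplicates the random split; balancing is instead delegated to class_weight in svm_train. If a genuinely balanced split were wanted here, one hypothetical option (not what this commit does) is to stratify on the record labels:

from sklearn.model_selection import train_test_split

# Hypothetical sketch: preserve class proportions across train/test.
# Assumes each Mongo record is a dict carrying its class label under 'title'.
labels = [record['title'] for record in data]
train, test = train_test_split(data, test_size=test_size,
                               random_state=SEED, stratify=labels)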
@@ -6,7 +6,7 @@ from evaluation import tpr_gender_gap
import pandas as pd
def main(load_data_from_saved, model_train, embedding_train, predict, evaluate, class_group, sampling, embedding, model, test_size, masking):
def main(load_data_from_saved, embedding_train, model_train, predict, evaluate, class_group, sampling, embedding, model, test_size, masking):
    print("\nprocessing train.main ...")
    data = pd.DataFrame(load_data(class_group=class_group, from_saved=load_data_from_saved))
@@ -38,8 +38,8 @@ def main(load_data_from_saved, model_train, embedding_train, predict, evaluate,
'''
sampling -> random, upsample, downsample, weighted
embedding -> cv: count_vectorize(self-trained), w2v: word2vec_embedding(pre-trained)
sampling -> random, balanced (upsample, downsample, weighted)
embedding -> cv: count_vectorize (self-trained), w2v: word2vec_embedding (pre-trained), self_w2v: w2v (self-trained)
model -> svm, rf, nn
'''
if __name__ == "__main__":
@@ -47,14 +47,14 @@ if __name__ == "__main__":
    start_time = time.time()
    main(load_data_from_saved = True,
         model_train = True,
         embedding_train = True,
         model_train = True,
         predict = True,
         evaluate = True,
         evaluate = False,
         class_group = 'trial',
         sampling = 'random',
         embedding = 'cv',
         sampling = 'balanced',
         embedding = 'self_w2v',
         model = 'svm',
         test_size = 0.2,
         masking = True)
    print("--- %s seconds ---" % (time.time() - start_time))
    print("\n--- %s seconds ---" % (time.time() - start_time))