Commit 69039e22 authored by Nishtha Jain
Browse files

elmo no save and self_w2v on full bio and raw

parent 1994fb64
......@@ -26,19 +26,21 @@ MASKED = { True:'bio',
DATASET_NAMES = {
# Naming convention : [dset|embd|modl]_[sv|rf|nn]_[cv|wv|tv|ev]_[tri|med]_[ran|bal]_[test_split]_[b|r]
('embedding','self_w2v','bio') : 'word_embeddings/embd__tv____b',
('embedding','self_w2v','raw') : 'word_embeddings/embd__tv____r',
# trial domain
('datasets','trial') : 'datasets/dset___tri___',
('datasets','trial','random',0.2) : 'datasets/dset___tri_ran_0.2_',
('datasets','trial','balanced',0.2) : 'datasets/dset___tri_bal_0.2_',
('embedding','cv','trial','random',0.2,'bio') : 'word_embeddings/embd__cv_tri_ran_0.2_b',
('embedding','cv','trial','random',0.2,'raw') : 'word_embeddings/embd__cv_tri_ran_0.2_r',
('embedding','w2v','trial','random',0.2,'bio') : 'word_embeddings/embd__wv_tri_ran_0.2_b',
('embedding','w2v','trial','random',0.2,'raw') : 'word_embeddings/embd__wv_tri_ran_0.2_r',
('embedding','cv','trial','balanced',0.2,'bio') : 'word_embeddings/embd__cv_tri_bal_0.2_b',
('embedding','w2v','trial','balanced',0.2,'bio') : 'word_embeddings/embd__wv_tri_bal_0.2_b',
('embedding','self_w2v','trial','balanced',0.2,'bio') : 'word_embeddings/embd__tv_tri_bal_0.2_b',
('embedding','elmo','trial','balanced',0.2,'bio') : 'word_embeddings/embd__ev_tri_bal_0.2_b',
('model','svm','cv','trial','random',0.2,'bio') : 'models/modl_sv_cv_tri_ran_0.2_b',
......@@ -62,8 +64,6 @@ DATASET_NAMES = {
('embedding','cv','medical','random',0.2,'raw') : 'word_embeddings/embd__cv_med_ran_0.2_r',
('embedding','w2v','medical','random',0.2,'bio') : 'word_embeddings/embd__wv_med_ran_0.2_b',
('embedding','w2v','medical','random',0.2,'raw') : 'word_embeddings/embd__wv_med_ran_0.2_r',
('embedding','self_w2v','medical','random',0.2,'bio') : 'word_embeddings/embd__tv_med_ran_0.2_b',
('embedding','self_w2v','medical','random',0.2,'raw') : 'word_embeddings/embd__tv_med_ran_0.2_r',
('embedding','elmo','medical','random',0.2,'bio') : 'word_embeddings/embd__ev_med_ran_0.2_b',
('embedding','elmo','medical','random',0.2,'raw') : 'word_embeddings/embd__ev_med_ran_0.2_r',
......@@ -71,8 +71,6 @@ DATASET_NAMES = {
('embedding','cv','medical','balanced',0.2,'raw') : 'word_embeddings/embd__cv_med_bal_0.2_r',
('embedding','w2v','medical','balanced',0.2,'bio') : 'word_embeddings/embd__wv_med_bal_0.2_b',
('embedding','w2v','medical','balanced',0.2,'raw') : 'word_embeddings/embd__wv_med_bal_0.2_r',
('embedding','self_w2v','medical','balanced',0.2,'bio') : 'word_embeddings/embd__tv_med_bal_0.2_b',
('embedding','self_w2v','medical','balanced',0.2,'raw') : 'word_embeddings/embd__tv_med_bal_0.2_r',
('embedding','elmo','medical','balanced',0.2,'bio') : 'word_embeddings/embd__ev_med_bal_0.2_b',
('embedding','elmo','medical','balanced',0.2,'raw') : 'word_embeddings/embd__ev_med_bal_0.2_r',
......
......@@ -12,7 +12,7 @@ import numpy as np
import pandas as pd
import tensorflow_hub as hub
from config import CLASS_GROUP
from sampling import get_data_from_mongo
from sampling import get_data_from_mongo, get_distinct_field_values
from sklearn.svm import SVC
from config import DATASET_NAMES, MASKED, SEED
import numpy as np
......@@ -29,27 +29,34 @@ def embedding_fit_transform(x_list, embedding, class_group, sampling, test_size,
elif embedding == 'w2v':
trained_embedding, X = pretrained_word2vec_fit_transform(x_list)
elif embedding == 'self_w2v':
trained_embedding, X = selftrained_word2vec_fit_transform(x_list)
trained_embedding, X = selftrained_word2vec_fit_transform(x_list,masking)
elif embedding == 'elmo':
trained_embedding, X = elmo_fit_transform(x_list)
X = elmo_transform(x_list)
dump(trained_embedding,DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
print("\t saving file :",DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]])
if embedding == 'self_w2v':
dump(trained_embedding,DATASET_NAMES['embedding','self_w2v',MASKED[masking]]+'.joblib')
print("\t saving file :",DATASET_NAMES['embedding','self_w2v',MASKED[masking]])
elif embedding != 'elmo':
dump(trained_embedding,DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
print("\t saving file :",DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]])
return(X)
def embedding_transform(x_list, embedding, class_group, sampling, test_size, masking):
    """Vectorise ``x_list`` with an embedding that was fitted earlier.

    Args:
        x_list: Texts to transform.
        embedding: One of ``'cv'``, ``'w2v'``, ``'self_w2v'`` or ``'elmo'``.
        class_group, sampling, test_size: Together with ``masking`` they
            select the ``DATASET_NAMES`` entry of the persisted embedding
            for ``'cv'`` and ``'w2v'``.
        masking: Key into ``MASKED``; selects the ``'bio'`` or ``'raw'``
            variant of the persisted embedding.

    Returns:
        The value returned by the matching ``*_transform`` helper.

    Raises:
        KeyError: If no ``DATASET_NAMES`` entry exists for the requested
            combination.
        ValueError: If ``embedding`` has a stored artefact but no transform
            is implemented for it.
    """
    # ELMo is never persisted: elmo_transform loads the TF-Hub module
    # itself, so there is nothing to read from disk for it.
    if embedding == 'elmo':
        return elmo_transform(x_list)

    # The self-trained word2vec model is stored once per masking variant,
    # independent of class group / sampling / test size.
    if embedding == 'self_w2v':
        key = ('embedding', 'self_w2v', MASKED[masking])
    else:
        key = ('embedding', embedding, class_group, sampling, test_size, MASKED[masking])
    # NOTE(review): `load` is presumably joblib.load given the suffix — confirm.
    trained_embedding = load(DATASET_NAMES[key] + '.joblib')

    if embedding == 'cv':
        return count_vectorize_transform(trained_embedding, x_list)
    if embedding in ('w2v', 'self_w2v'):
        return word2vec_transform(trained_embedding, x_list)
    raise ValueError("unsupported embedding: {!r}".format(embedding))
......@@ -123,18 +130,19 @@ def selftrained_word2vec_fit(corpus):
return model
def selftrained_word2vec_fit_transform(x_list, masking):
    """Train a word2vec model on the full corpus and embed ``x_list`` with it.

    The training corpus is not ``x_list`` itself but every distinct value of
    the Mongo field named ``MASKED[masking]``, as returned by
    ``get_distinct_field_values``.

    Args:
        x_list: Texts to transform with the freshly trained model.
        masking: Key into ``MASKED``; selects which text field is used as
            the training corpus.

    Returns:
        A ``(model, X)`` tuple: the model returned by
        ``selftrained_word2vec_fit`` and the result of
        ``word2vec_transform(model, x_list)``.
    """
    corpus = get_distinct_field_values(MASKED[masking])
    model = selftrained_word2vec_fit(corpus)
    X = word2vec_transform(model, x_list)
    return model, X
def elmo_transform(elmo, x_list):
def elmo_transform(x_list):
elmo = hub.load("https://tfhub.dev/google/elmo/3")
X = []
for sent in x_list:
sent = preProcessAndTokenize(sent)
......@@ -143,13 +151,13 @@ def elmo_transform(elmo, x_list):
X.append(np.mean(embeddings['word_emb'],1).flatten())
return X
'''
def elmo_fit_transform(x_list):
elmo = hub.load("https://tfhub.dev/google/elmo/3")
X = elmo_transform(elmo,x_list)
return elmo , X
'''
......
......@@ -17,6 +17,19 @@ def get_data_from_mongo(class_group):
return(data)
def get_distinct_field_values(field_name):
    """Return the distinct values stored under ``field_name``.

    Queries the collection ``MONGO_DB``/``MONGO_COLLECTION`` on
    ``MONGO_HOST``.

    Args:
        field_name: Name of the document field to collect distinct values of.

    Returns:
        The result of ``collection.distinct(field_name)``.
    """
    print('processing sampling.get_distinct_field_values ...')
    # Use the client as a context manager so its connections are closed when
    # the query is done; previously every call left an open client behind.
    with pymongo.MongoClient(MONGO_HOST) as client:
        collection = client[MONGO_DB][MONGO_COLLECTION]
        return collection.distinct(field_name)
def load_data(class_group, from_saved=True):
print('processing sampling.load_data ...')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment