Commit 9936dc72 authored by Nishtha Jain

w2v saving removed

parent a695e7f9
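
This commit stops fitting and persisting the pretrained word2vec embedding: pretrained_word2vec_fit_transform becomes pretrained_word2vec_transform, which loads the vectors from WORD2VEC_PATH via gensim's KeyedVectors and returns only the transformed features, so nothing is dumped to joblib for 'w2v'. The corresponding w2v (and elmo) path entries in DATASET_NAMES are commented out, and the unused rf/nn training branches are disabled alongside.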
@@ -39,11 +39,11 @@ DATASET_NAMES = {
     ('embedding','cv','trial','random',0.2,'bio') : 'word_embeddings/embd__cv_tri_ran_0.2_b',
     ('embedding','cv','trial','random',0.2,'raw') : 'word_embeddings/embd__cv_tri_ran_0.2_r',
-    ('embedding','w2v','trial','random',0.2,'bio') : 'word_embeddings/embd__wv_tri_ran_0.2_b',
-    ('embedding','w2v','trial','random',0.2,'raw') : 'word_embeddings/embd__wv_tri_ran_0.2_r',
+    # ('embedding','w2v','trial','random',0.2,'bio') : 'word_embeddings/embd__wv_tri_ran_0.2_b',
+    # ('embedding','w2v','trial','random',0.2,'raw') : 'word_embeddings/embd__wv_tri_ran_0.2_r',
     ('embedding','cv','trial','balanced',0.2,'bio') : 'word_embeddings/embd__cv_tri_bal_0.2_b',
-    ('embedding','w2v','trial','balanced',0.2,'bio') : 'word_embeddings/embd__wv_tri_bal_0.2_b',
-    ('embedding','elmo','trial','balanced',0.2,'bio') : 'word_embeddings/embd__ev_tri_bal_0.2_b', ## needed???
+    # ('embedding','w2v','trial','balanced',0.2,'bio') : 'word_embeddings/embd__wv_tri_bal_0.2_b',
+    # ('embedding','elmo','trial','balanced',0.2,'bio') : 'word_embeddings/embd__ev_tri_bal_0.2_b', ## needed???
     ('model','svm','cv','trial','random',0.2,'bio') : 'models/modl_sv_cv_tri_ran_0.2_b',
     ('model','svm','cv','trial','random',0.2,'raw') : 'models/modl_sv_cv_tri_ran_0.2_r',
@@ -64,17 +64,17 @@ DATASET_NAMES = {
     ('embedding','cv','medical','random',0.2,'bio') : 'word_embeddings/embd__cv_med_ran_0.2_b',
     ('embedding','cv','medical','random',0.2,'raw') : 'word_embeddings/embd__cv_med_ran_0.2_r',
-    ('embedding','w2v','medical','random',0.2,'bio') : 'word_embeddings/embd__wv_med_ran_0.2_b',
-    ('embedding','w2v','medical','random',0.2,'raw') : 'word_embeddings/embd__wv_med_ran_0.2_r',
-    ('embedding','elmo','medical','random',0.2,'bio') : 'word_embeddings/embd__ev_med_ran_0.2_b',
-    ('embedding','elmo','medical','random',0.2,'raw') : 'word_embeddings/embd__ev_med_ran_0.2_r',
+    # ('embedding','w2v','medical','random',0.2,'bio') : 'word_embeddings/embd__wv_med_ran_0.2_b',
+    # ('embedding','w2v','medical','random',0.2,'raw') : 'word_embeddings/embd__wv_med_ran_0.2_r',
+    # ('embedding','elmo','medical','random',0.2,'bio') : 'word_embeddings/embd__ev_med_ran_0.2_b',
+    # ('embedding','elmo','medical','random',0.2,'raw') : 'word_embeddings/embd__ev_med_ran_0.2_r',
     ('embedding','cv','medical','balanced',0.2,'bio') : 'word_embeddings/embd__cv_med_bal_0.2_b',
     ('embedding','cv','medical','balanced',0.2,'raw') : 'word_embeddings/embd__cv_med_bal_0.2_r',
-    ('embedding','w2v','medical','balanced',0.2,'bio') : 'word_embeddings/embd__wv_med_bal_0.2_b',
-    ('embedding','w2v','medical','balanced',0.2,'raw') : 'word_embeddings/embd__wv_med_bal_0.2_r',
-    ('embedding','elmo','medical','balanced',0.2,'bio') : 'word_embeddings/embd__ev_med_bal_0.2_b',
-    ('embedding','elmo','medical','balanced',0.2,'raw') : 'word_embeddings/embd__ev_med_bal_0.2_r',
+    # ('embedding','w2v','medical','balanced',0.2,'bio') : 'word_embeddings/embd__wv_med_bal_0.2_b',
+    # ('embedding','w2v','medical','balanced',0.2,'raw') : 'word_embeddings/embd__wv_med_bal_0.2_r',
+    # ('embedding','elmo','medical','balanced',0.2,'bio') : 'word_embeddings/embd__ev_med_bal_0.2_b',
+    # ('embedding','elmo','medical','balanced',0.2,'raw') : 'word_embeddings/embd__ev_med_bal_0.2_r',
     ('model','svm','cv','medical','random',0.2,'bio') : 'models/modl_sv_cv_med_ran_0.2_b',
     ('model','svm','cv','medical','random',0.2,'raw') : 'models/modl_sv_cv_med_ran_0.2_r',
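For context, the tuple keys above are resolved to artifact paths when saving and loading with joblib, as the model.py and preprocessing.py hunks below do. A minimal sketch of that pattern; the chosen key and the trained_embedding object are illustrative, not part of this commit:

# Sketch of the save/load pattern around DATASET_NAMES.
from joblib import dump, load

key = ('embedding','cv','trial','random',0.2,'bio')
path = DATASET_NAMES[key] + '.joblib'   # 'word_embeddings/embd__cv_tri_ran_0.2_b.joblib'
# dump(trained_embedding, path)         # persist a fitted vectorizer
# trained_embedding = load(path)        # reload it later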
@@ -7,10 +7,10 @@ def model_training(X_train, Y_train, model, embedding, class_group, sampling, te
     print("processing model.model_training ...")
     if model == 'svm':
         trained_model = svm_train(X_train, Y_train, sampling)
-    elif model == 'rf':
-        trained_model = rf_train(X_train, Y_train)
-    elif model == 'nn':
-        trained_model = nn_train(X_train, Y_train)
+    # elif model == 'rf':
+    #     trained_model = rf_train(X_train, Y_train)
+    # elif model == 'nn':
+    #     trained_model = nn_train(X_train, Y_train)
     print("\t saving file :",DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]])
     dump(trained_model, DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
@@ -56,7 +56,7 @@ def svm_predict(X_test, Y_test, classifier):
     return prediction, acc

+'''
 def rf_train(X_train, Y_train):
     '''
     check again everything !!!!!!
@@ -136,3 +136,4 @@ def nn_predict(X_test, Y_test, model):
     prediction = None
     acc = None
     return prediction, acc
+'''
\ No newline at end of file
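
Note that with the rf/nn branches commented out, model_training binds trained_model only when model == 'svm'; any other value now reaches the dump call with trained_model unbound and raises NameError. A hedged sketch of an explicit guard, not part of this commit, that would make the constraint visible:

# Hypothetical guard, not in this commit: fail fast instead of
# reaching dump() with trained_model unbound.
if model != 'svm':
    raise ValueError(f"model '{model}' is disabled in this revision; only 'svm' is supported")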
@@ -26,35 +26,37 @@ def embedding_fit_transform(x_list, embedding, class_group, sampling, test_size,
     if embedding == 'cv':
         trained_embedding, X = count_vectorize_fit_transform(x_list)
-    elif embedding == 'w2v':
-        trained_embedding, X = pretrained_word2vec_fit_transform(x_list)
-        dump(trained_embedding,DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
-        print("\t saving file :",DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]])
     elif embedding == 'self_w2v':
         trained_embedding, X = selftrained_word2vec_fit_transform(x_list,masking)
-    elif embedding == 'elmo':
-        X = elmo_transform(x_list)
     if embedding == 'self_w2v':
         dump(trained_embedding,DATASET_NAMES['embedding','self_w2v',MASKED[masking]]+'.joblib')
         print("\t saving file :",DATASET_NAMES['embedding','self_w2v',MASKED[masking]])
-    elif embedding != 'elmo':
-        dump(trained_embedding,DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
-        print("\t saving file :",DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]])
+    elif embedding == 'w2v':
+        X = pretrained_word2vec_transform(x_list)
+    elif embedding == 'elmo':
+        X = elmo_transform(x_list)
     return(X)

 def embedding_transform(x_list, embedding, class_group, sampling, test_size, masking ):
     # print("processing preprocessing.embedding_transform ...")
-    if embedding == 'self_w2v':
-        trained_embedding = load(DATASET_NAMES['embedding','self_w2v',MASKED[masking]]+'.joblib')
-    elif embedding != 'elmo':
-        trained_embedding = load(DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
     if embedding == 'cv':
+        trained_embedding = load(DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
         X = count_vectorize_transform(trained_embedding,x_list)
-    elif embedding == 'w2v'or embedding == 'self_w2v':
+    elif embedding == 'self_w2v':
+        trained_embedding = load(DATASET_NAMES['embedding','self_w2v',MASKED[masking]]+'.joblib')
         X = word2vec_transform(trained_embedding,x_list)
+    elif embedding == 'w2v':
+        X = pretrained_word2vec_transform(x_list)
     elif embedding == 'elmo':
         X = elmo_transform(x_list)
     return(X)
@@ -114,14 +116,14 @@ def word2vec_transform(model, x_list):
     return X

-def pretrained_word2vec_fit_transform(x_list):
+def pretrained_word2vec_transform(x_list):
     # pre trained
     # check again
     # model = Word2Vec.load(WORD2VEC_PATH)
     model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary = True)
     X = word2vec_transform(model, x_list)
-    return model , X
+    return X

 def selftrained_word2vec_fit(corpus):
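After this change, the pretrained word2vec path is load-and-transform only: KeyedVectors.load_word2vec_format reads the binary vectors from WORD2VEC_PATH on each call, and the model is never dumped. A minimal, self-contained sketch of that flow; the file path and the mean-pooling inside word2vec_transform are assumptions for illustration, since word2vec_transform's body is not shown in this diff:

# Standalone sketch of the new 'w2v' flow; WORD2VEC_PATH and the
# mean-pooling transform are illustrative assumptions.
import numpy as np
from gensim.models import KeyedVectors

WORD2VEC_PATH = 'GoogleNews-vectors-negative300.bin'  # hypothetical location

def word2vec_transform(model, x_list):
    # Mean-pool vectors of in-vocabulary tokens; zeros if none are known.
    rows = []
    for text in x_list:
        vecs = [model[w] for w in text.split() if w in model]
        rows.append(np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size))
    return np.vstack(rows)

def pretrained_word2vec_transform(x_list):
    # Load the pretrained vectors on the fly; nothing is saved to disk.
    model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)
    return word2vec_transform(model, x_list)

X = pretrained_word2vec_transform(["registered nurse", "software engineer"])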