Commit 035e4bf6 authored by Nishtha Jain

code update, dump and load

parent a8e2b334
@@ -2,16 +2,35 @@ MONGO_HOST = "mongodb+srv://root:Deployment123@clusterbiobias.4mc8e.mongodb.net/
MONGO_DB = 'biodb'
MONGO_COLLECTION = 'allbio'
TITLE = 'title'
CLASSES = [ 'physician',
CLASS_GROUP = {'medical': [ 'physician',
'nurse',
'psychologist',
'dentist',
'surgeon',
'dietitian',
'chiropractor'
]
N_CLASSES = len(CLASSES)
]}
SEED = 414325
WORD2VEC_PATH = "word_embeddings/GoogleNews-vectors-negative300.bin"
\ No newline at end of file
WORD2VEC_PATH = "word_embeddings/GoogleNews-vectors-negative300.bin"
MASKED = { True:'bio',
False:'raw'
}
DATASET_NAMES = { # key: (kind, ..., class_group, sampling, test_size, masked); tuples, since lists are unhashable as dict keys
    ('datasets','medical') : 'datasets/medical_dataset',
    ('datasets','medical','random',0.2) : 'datasets/set_1',
    ('embedding','cv','medical','random',0.2,'bio') : 'word_embeddings/embd_1',
    ('embedding','cv','medical','random',0.2,'raw') : 'word_embeddings/embd_2',
    ('model','svm','cv','medical','random',0.2,'bio') : 'models/model_1',
    ('model','svm','cv','medical','random',0.2,'raw') : 'models/model_2',
    ('model','svm','w2v','medical','random',0.2,'bio') : 'models/model_3',
    ('model','svm','w2v','medical','random',0.2,'raw') : 'models/model_4'
}
\ No newline at end of file
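# Hedged usage note (not in the commit): with tuple keys as above, artifact
# paths resolve by plain tuple indexing, e.g.
#   DATASET_NAMES['model', 'svm', 'cv', 'medical', 'random', 0.2, MASKED[True]]
# evaluates to 'models/model_1'.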
import numpy as np
from joblib import dump, load
from sklearn.svm import SVC
from config import DATASET_NAMES, MASKED
def svm(X_train, Y_train, X_test, Y_test):
def model_training(X_train, Y_train, model, embedding, class_group, sampling, test_size, masking):
    if model == 'svm':
        trained_model = svm_train(X_train, Y_train)
    elif model == 'rf':
        trained_model = rf_train(X_train, Y_train)
    elif model == 'nn':
        trained_model = nn_train(X_train, Y_train)
    dump(trained_model, DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')

def model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking):
    trained_model = load(DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
    if model == 'svm':
        svm_predict(X_test, Y_test, trained_model)
    elif model == 'rf':
        rf_predict(X_test, Y_test, trained_model)
    elif model == 'nn':
        nn_predict(X_test, Y_test, trained_model)

def load_model(model, embedding, class_group, sampling, test_size, masking):
    return load(DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
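# Hedged usage sketch (illustrative, not in the commit): train once, persist
# through joblib, then reload the same artifact via the shared DATASET_NAMES key.
#   model_training(X_train, Y_train, 'svm', 'cv', 'medical', 'random', 0.2, True)
#   clf = load_model('svm', 'cv', 'medical', 'random', 0.2, True)  # reads models/model_1.joblib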
def svm_train(X_train, Y_train):
    classifier = SVC(C=1, kernel='linear', gamma='auto')
    classifier.fit(X_train, Y_train)
    return classifier

def svm_predict(X_test, Y_test, classifier):
    prediction = classifier.predict(X_test)
    print('\nPrediction accuracy on test set :', np.mean(prediction == Y_test))
    return prediction
def rf(X_train, Y_train, X_test, Y_test):
def rf_train(X_train, Y_train):
    '''
    check again everything !!!!!!
    '''
@@ -50,9 +81,10 @@ def rf(X_train, Y_train, X_test, Y_test):
    y_pred = np.vectorize(reversefactor.get)(df['raw_title'][11000:16000])
    print('\nPrediction accuracy :', np.mean(y_pred == y_pred_test))
def rf_predict(X_test, Y_test, model):
    ## TO-DO:
    pass
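# Hedged sketch (not part of this commit): one plausible way to fill the
# rf_predict TO-DO, assuming scikit-learn's RandomForestClassifier; the helper
# names rf_train_sketch / rf_predict_sketch are hypothetical.
from sklearn.ensemble import RandomForestClassifier
from config import SEED
def rf_train_sketch(X_train, Y_train):
    classifier = RandomForestClassifier(n_estimators=100, random_state=SEED)
    classifier.fit(X_train, Y_train)
    return classifier
def rf_predict_sketch(X_test, Y_test, model):
    prediction = model.predict(X_test)
    print('\nPrediction accuracy :', np.mean(prediction == Y_test))
    return prediction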
def nn(X_train, Y_train, X_test, Y_test):
def nn_train(X_train, Y_train):
    '''
    check again everything !!!!!!
@@ -80,4 +112,5 @@ def nn(X_train, Y_train, X_test, Y_test):
    history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

def nn_predict(X_test, Y_test, model):
    ## TO-DO:
    pass
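# Hedged sketch (not part of this commit): assuming nn_train returns a compiled
# Keras classifier with a softmax output and integer-encoded labels, prediction
# could map probabilities back to class indices; nn_predict_sketch is hypothetical.
def nn_predict_sketch(X_test, Y_test, model):
    # argmax over the softmax probabilities picks the most likely class index
    prediction = model.predict(X_test).argmax(axis=1)
    print('\nPrediction accuracy :', np.mean(prediction == Y_test))
    return prediction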
@@ -3,7 +3,7 @@ from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from config import WORD2VEC_PATH
from config import WORD2VEC_PATH, DATASET_NAMES, MASKED
from joblib import dump, load
import gensim
@@ -11,6 +11,25 @@ nltk.download('punkt')
nltk.download('stopwords')
stop_words=set(stopwords.words("english"))
def embedding_fit_transform(x_list, embedding, class_group, sampling, test_size, masking):
    if embedding == 'cv':
        trained_embedding, X = count_vectorize_fit_transform(x_list)
    elif embedding == 'w2v':
        trained_embedding, X = word2vec_fit_transform(x_list)
    dump(trained_embedding, DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
    return X

def embedding_transform(x_list, embedding, class_group, sampling, test_size, masking):
    trained_embedding = load(DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
    if embedding == 'cv':
        X = count_vectorize_transform(trained_embedding, x_list)
    elif embedding == 'w2v':
        X = word2vec_transform(trained_embedding, x_list)
    return X
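# Hedged usage sketch (illustrative only): fit the vectorizer on the training
# split, persist it, then reuse the saved artifact on the test split so both
# share one vocabulary.
#   X_train = embedding_fit_transform(train['bio'], 'cv', 'medical', 'random', 0.2, True)
#   X_test  = embedding_transform(test['bio'], 'cv', 'medical', 'random', 0.2, True)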
''' Add this with every embedding
@@ -31,21 +50,33 @@ def preProcessAndTokenize(corpus):
    return stemmed
def count_vectorize(x_list):
def count_vectorize_fit_transform(x_list):
    vectorizer_binary = CountVectorizer(lowercase=True, preprocessor=None, binary=True, stop_words=None, tokenizer=preProcessAndTokenize)
    binaryDocumentMatrix = vectorizer_binary.fit_transform(x_list)
    print("count vectorized with dimension : ", len(vectorizer_binary.get_feature_names()))
    return binaryDocumentMatrix
    X = vectorizer_binary.fit_transform(x_list)
    # print("count vectorized with dimension : ", len(vectorizer_binary.get_feature_names()))
    return vectorizer_binary, X

def count_vectorize_transform(vectorizer_binary, x_list):
    X = vectorizer_binary.transform(x_list)
    return X
def word2vec_embedding(x_list):
def word2vec_fit_transform(x_list):
    ## To-Do
    # add preprocessing step
    # check again
    model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)
    X = [[model[word] for word in sent] for sent in x_list]
    return model, X

def word2vec_transform(model, x_list):
    X = [[model[word] for word in sent] for sent in x_list]
    return X
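# Hedged sketch (not in the commit): the per-word vectors above give each bio a
# variable-length sequence, which SVC cannot consume directly. One common fix is
# mean-pooling into a single fixed-length vector, skipping out-of-vocabulary
# words. The helper name word2vec_mean_pool is hypothetical.
import numpy as np
def word2vec_mean_pool(model, x_list, dim=300):
    # average the 300-d GoogleNews vectors of in-vocabulary words per document;
    # fall back to a zero vector when no word is in the vocabulary
    return np.array([
        np.mean([model[w] for w in sent if w in model], axis=0)
        if any(w in model for w in sent) else np.zeros(dim)
        for sent in x_list
    ])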
from sklearn.model_selection import train_test_split
import pymongo
from config import MONGO_HOST, MONGO_DB, MONGO_COLLECTION, CLASS_GROUP, DATASET_NAMES, SEED
from joblib import dump, load
def get_data_from_mongo(class_group):
    client = pymongo.MongoClient(MONGO_HOST)
    collection = client[MONGO_DB][MONGO_COLLECTION]
    # materialise the cursor so the documents (not an exhausted cursor) get dumped
    data = list(collection.find({'$or': [{'title': title} for title in CLASS_GROUP[class_group]]}))
    dump(data, DATASET_NAMES['datasets',class_group]+'.joblib')
    return data
def load_data(class_group, from_saved=True):
    if not from_saved:
        data = get_data_from_mongo(class_group)
    else:
        data = load(DATASET_NAMES['datasets',class_group]+'.joblib')
    return data
'''
sampling -> random, upsample, downsample, weighted
'''
def data_selection(data, class_group, sampling, test_size, masking=True):
    if sampling == 'random':
        train, test = train_test_split(data, test_size=test_size, random_state=SEED)
    elif sampling == 'downsample':
        # TO-DO:
        train, test = None, None
    else:
        # TO-DO:
        train, test = None, None
    dump([train, test], DATASET_NAMES['datasets',class_group,sampling,test_size]+'.joblib')
    return train, test
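# Hedged sketch (not in the commit): one way to fill the 'downsample' TO-DO is
# to trim every class to the size of the rarest class before splitting. The
# helper name downsample_sketch is hypothetical and assumes data is a pandas
# DataFrame with a 'title' column.
def downsample_sketch(data, test_size):
    n_min = data['title'].value_counts().min()
    balanced = data.groupby('title', group_keys=False).apply(
        lambda g: g.sample(n=n_min, random_state=SEED))
    return train_test_split(balanced, test_size=test_size, random_state=SEED)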
from config import CLASSES, N_CLASSES
from preprocessing import count_vectorize
from utils import get_data_from_mongo
'''
embedding : count_vectorize(self-trained), word2vec_embedding(pre-trained)
model : svm, rf, nn
'''
def train(data, embedding, training_model, masked=True):
    ## TO-DO:
    # write generic train
    # return trained_model
    if masked:
        X = embedding(data['bio'])
    else:
        X = embedding(data['raw'])
    Y = data['raw_title']
    model = training_model()
from config import CLASS_GROUP, TITLE, MASKED, SEED, DATASET_NAMES
from preprocessing import embedding_fit_transform,embedding_transform
from sampling import data_selection, load_data
from joblib import dump, load
from model import model_training, model_prediction
import pandas as pd
def main(class_group, sampling, embedding, model, test_size, masking):
    data = pd.DataFrame(load_data(class_group, from_saved=False))
    train, test = data_selection(data, class_group, sampling, test_size, masking)
    # training
    X_train = embedding_fit_transform(train[MASKED[masking]], embedding, class_group, sampling, test_size, masking)
    Y_train = train[TITLE]
    model_training(X_train, Y_train, model, embedding, class_group, sampling, test_size, masking)
    # prediction
    X_test = embedding_transform(test[MASKED[masking]], embedding, class_group, sampling, test_size, masking)
    Y_test = test[TITLE]
    model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking)
data = get_data_from_mongo(CLASSES)
X_raw = count_vectorize(data['raw'])
X_bio = count_vectorize(data['bio'])
Y = data['raw_title']
X = X_raw
# X = X_bio
\ No newline at end of file
'''
sampling -> random, upsample, downsample, weighted
embedding -> cv: count_vectorize(self-trained), w2v: word2vec_embedding(pre-trained)
model -> svm, rf, nn
'''
if __name__ == "__main__":
    main(class_group='medical', sampling='random', embedding='cv', model='svm', test_size=0.2, masking=True)