Commit 50ba55ea authored by Nishtha Jain's avatar Nishtha Jain
Browse files

running cv and svm pipeline

parent d2ff698b
......@@ -6,3 +6,7 @@ to add pretrained word embedding models, write in terminal:
`gzip -d GoogleNews-vectors-negative300.bin.gz`
Install: gensim, pymongo, dnspython
\ No newline at end of file
......@@ -4,14 +4,17 @@ MONGO_COLLECTION = 'allbio'
TITLE = 'title'
# Occupation classes per domain. Keys are class-group names used throughout
# the pipeline (dataset/embedding/model registry lookups); values are the
# occupation titles belonging to that group.
CLASS_GROUP = {'medical' : ['physician',
                            'nurse',
                            'psychologist',
                            'dentist',
                            'surgeon',
                            'dietitian',
                            'chiropractor'
                            ],
               'trial' : ['yoga_teacher',
                          'personal_trainer']
               }
SEED = 414325
......@@ -21,16 +24,33 @@ MASKED = { True:'bio',
False:'raw'
}
# Registry mapping a pipeline-artifact key (a tuple, since dict keys must be
# hashable) to its on-disk path prefix (no extension; '.joblib' is appended
# at load/dump time).  Key layouts:
#   ('datasets', class_group)                                          -> raw dataset
#   ('datasets', class_group, sampling, test_size)                     -> train/test split
#   ('embedding', embedding, class_group, sampling, test_size, mask)   -> fitted embedding
#   ('model', model, embedding, class_group, sampling, test_size, mask)-> trained model
DATASET_NAMES = {
                # trial domain
                ('datasets','trial') : 'datasets/trial_dataset',
                ('datasets','trial','random',0.2) : 'datasets/trial_set_1',
                ('embedding','cv','trial','random',0.2,'bio') : 'word_embeddings/trial_embd_1',
                ('embedding','cv','trial','random',0.2,'raw') : 'word_embeddings/trial_embd_2',
                ('embedding','w2v','trial','random',0.2,'bio') : 'word_embeddings/trial_embd_3',
                ('embedding','w2v','trial','random',0.2,'raw') : 'word_embeddings/trial_embd_4',
                ('model','svm','cv','trial','random',0.2,'bio') : 'models/trial_model_1',
                ('model','svm','cv','trial','random',0.2,'raw') : 'models/trial_model_2',
                ('model','svm','w2v','trial','random',0.2,'bio') : 'models/trial_model_3',
                ('model','svm','w2v','trial','random',0.2,'raw') : 'models/trial_model_4',
                # medical domain
                ('datasets','medical') : 'datasets/medical_dataset',
                ('datasets','medical','random',0.2) : 'datasets/set_1',
                ('embedding','cv','medical','random',0.2,'bio') : 'word_embeddings/embd_1',
                ('embedding','cv','medical','random',0.2,'raw') : 'word_embeddings/embd_2',
                ('embedding','w2v','medical','random',0.2,'bio') : 'word_embeddings/embd_3',
                ('embedding','w2v','medical','random',0.2,'raw') : 'word_embeddings/embd_4',
                ('model','svm','cv','medical','random',0.2,'bio') : 'models/model_1',
                ('model','svm','cv','medical','random',0.2,'raw') : 'models/model_2',
                ('model','svm','w2v','medical','random',0.2,'bio') : 'models/model_3',
                ('model','svm','w2v','medical','random',0.2,'raw') : 'models/model_4'
                }
\ No newline at end of file
from config import CLASSES, MASKED, TITLE
from config import CLASS_GROUP, MASKED, TITLE
from model import model_prediction
from preprocessing import embedding_transform
import numpy as np
def tpr_gender_gap(test, model, embedding, class_group, sampling, test_size, masking):
    """Compute per-class, per-gender true-positive rates on the test split.

    For every occupation in CLASS_GROUP[class_group] and each gender
    ('M'/'F'), runs the stored model on that subset of `test` and records
    how many predictions match the true title, so per-class gender gaps in
    TPR can be compared.

    Returns a list of dicts with keys 'class', 'gender', 'tp', 'count', 'tpr'.
    """
    print("processing evaluation.tpr_gender_gap ...")
    tpr = []
    for c in CLASS_GROUP[class_group]:
        for gender in ['M','F']:
            mini_set = test.loc[(test['gender']==gender) & (test[TITLE]==c)]
            # Guard: a (class, gender) cell can be empty in a small split;
            # record a zero row instead of dividing by zero below.
            if len(mini_set) == 0:
                tpr.append({'class':c, 'gender':gender, 'tp':0, 'count':0, 'tpr':0.0})
                continue
            X_test = embedding_transform(mini_set[MASKED[masking]], embedding, class_group, sampling, test_size, masking)
            pred, acc = model_prediction(X_test=X_test, Y_test=mini_set[TITLE], model=model, embedding=embedding, class_group=class_group, sampling=sampling, test_size=test_size, masking=masking)
            tp = np.sum(pred == mini_set[TITLE])
            tpr.append({'class':c, 'gender':gender, 'tp':tp, 'count':len(pred), 'tpr':float(tp)/len(pred)})
    return(tpr)
\ No newline at end of file
from sklearn.svm import SVC
from config import DATASET_NAMES,MASKED
from joblib import dump, load
import numpy as np
def model_training(X_train, Y_train, model, embedding, class_group, sampling, test_size, masking):
print("processing model.model_training ...")
if model == 'svm':
trained_model = svm_train(X_train, Y_train)
elif model == 'rf':
......@@ -14,6 +16,8 @@ def model_training(X_train, Y_train, model, embedding, class_group, sampling, te
def model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking):
print("processing model.model_prediction ...")
trained_model = load(DATASET_NAMES['model',model,embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
if model == 'svm':
......@@ -25,21 +29,23 @@ def model_prediction(X_test, Y_test, model, embedding, class_group, sampling, te
return pred, acc
def load_model(model, embedding, class_group, sampling, test_size, masking):
    """Load a previously trained model from its registered joblib path.

    The tuple key layout matches the 'model' entries of DATASET_NAMES;
    MASKED maps the boolean masking flag to the 'bio'/'raw' suffix.
    """
    return load(DATASET_NAMES['model', model, embedding, class_group, sampling, test_size, MASKED[masking]] + '.joblib')
def svm_train(X_train, Y_train):
    """Train and return a linear-kernel support-vector classifier."""
    # Debug output retained from the original pipeline runs.
    print("shape of X_train",X_train.shape)
    print("first instance of X_train", X_train[0])
    svc = SVC(C=1, kernel="linear", gamma="auto")
    svc.fit(X_train, Y_train)
    return svc
def svm_predict(X_test, Y_test, classifier):
    """Predict labels for X_test with a fitted classifier.

    Returns (predictions, accuracy), where accuracy is the fraction of
    predictions equal to Y_test (mean of element-wise equality).
    """
    prediction = classifier.predict(X_test)
    acc = np.mean(prediction == Y_test)
    return prediction, acc
......
import nltk
from nltk.corpus import stopwords
# from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from config import WORD2VEC_PATH, DATASET_NAMES, MASKED
import gensim
# from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from joblib import dump, load
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
stop_words=set(stopwords.words("english"))
# nltk.download('punkt')
# nltk.download('stopwords')
stop_words=set(nltk.corpus.stopwords.words("english"))
def embedding_fit_transform(x_list, embedding, class_group, sampling, test_size, masking):
print("processing preprocessing.embedding_fit_transform ...")
if embedding == 'cv':
trained_embedding, X = count_vectorize_fit_transform(x_list)
elif embedding == 'w2v':
......@@ -22,8 +26,9 @@ def embedding_fit_transform(x_list, embedding, class_group, sampling, test_size,
def embedding_transform(x_list, embedding, class_group, sampling, test_size, masking ):
print("processing preprocessing.embedding_transform ...")
trained_embedding = loads(DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
trained_embedding = load(DATASET_NAMES['embedding',embedding,class_group,sampling,test_size,MASKED[masking]]+'.joblib')
if embedding == 'cv':
X = count_vectorize_transform(trained_embedding,x_list)
......@@ -38,9 +43,9 @@ def embedding_transform(x_list, embedding, class_group, sampling, test_size, mas
-> nltk english stop_words
-> SnowballStemmer
'''
def preProcessAndTokenize(corpus):
def preProcessAndTokenize(sentence):
tokenizer = nltk.RegexpTokenizer(r"\w+")
tokenized_words=tokenizer.tokenize(corpus.lower())
tokenized_words=tokenizer.tokenize(sentence.lower())
filtered_words=[]
for w in tokenized_words:
if w.lower() not in stop_words:
......@@ -64,16 +69,17 @@ def count_vectorize_transform(vectorizer_binary,x_list):
def word2vec_fit_transform(x_list):
    """Load the pretrained word2vec model and embed every document.

    Each document in x_list is tokenized/stemmed by preProcessAndTokenize;
    tokens absent from the model vocabulary are dropped, the rest are
    replaced by their word2vec vectors.  Returns (model, X).
    """
    # TODO: add a dedicated preprocessing step / revisit pretrained choice.
    # NOTE(review): documents yield variable-length token lists, so
    # np.array(...) builds a ragged object array — confirm downstream
    # consumers expect that shape.
    model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)
    X = np.array([[model[word] for word in filter(lambda x: x in model, preProcessAndTokenize(sent))] for sent in x_list])
    return model, X
def word2vec_transform(model, x_list):
    """Embed documents with an already-loaded word2vec model.

    Same tokenization and out-of-vocabulary filtering as
    word2vec_fit_transform, but reuses the given model.
    """
    X = np.array([[model[word] for word in filter(lambda x: x in model, preProcessAndTokenize(sent))] for sent in x_list])
    return X
......
from sklearn.model_selection import train_test_split
import pymongo
from config import MONGO_HOST
from config import MONGO_HOST, MONGO_DB, MONGO_COLLECTION, CLASS_GROUP, DATASET_NAMES, SEED
from joblib import dump, load
def get_data_from_mongo(class_group):
    """Fetch all documents whose title belongs to class_group from Mongo.

    The cursor is materialized to a list, cached to the registered joblib
    dataset path, and returned.
    """
    print('processing sampling.get_data_from_mongo ...')
    client = pymongo.MongoClient(MONGO_HOST)
    try:
        collection = client[MONGO_DB][MONGO_COLLECTION]
        # Materialize the cursor so the data survives closing the connection.
        data = list(collection.find({'$or': [{'title': title} for title in CLASS_GROUP[class_group]]}))
    finally:
        # Fix: the original never closed the Mongo connection.
        client.close()
    dump(data, DATASET_NAMES['datasets', class_group] + '.joblib')
    return(data)
def load_data(class_group, from_saved=True):
    """Return the raw dataset for class_group.

    With from_saved=True the cached joblib dump is loaded; otherwise the
    data is (re)fetched from Mongo, which also refreshes the cache.
    """
    print('processing sampling.load_data ...')
    if from_saved:
        data = load(DATASET_NAMES['datasets', class_group] + '.joblib')
    else:
        data = get_data_from_mongo(class_group)
    return data
'''
sampling -> random, upsample, downsample, weighted
'''
def data_selection(data, sampling, test_size, masking=True):
def data_selection(data, class_group, sampling, test_size, masking=True):
print('processing sampling.data_selection ...')
if sampling == 'random':
train,test = train_test_split(data, test_size = test_size, random_state = SEED)
......
from config import CLASS_GROUP, TITLE, MASKED, SEED, DATASET_NAMES
from preprocessing import embedding_fit_transform,embedding_transform
from preprocessing import embedding_fit_transform, embedding_transform
from sampling import data_selection, load_data
from joblib import dump, load
from model import model_training,
from model import model_training, model_prediction
from evaluation import tpr_gender_gap
import pandas as pd
def main(load_data_from_saved, model_train, embedding_train, predict, evaluate, class_group, sampling, embedding, model, test_size, masking):
    """Run the bio-classification pipeline end to end.

    Stage toggles:
      load_data_from_saved -- reuse the cached dataset instead of Mongo.
      model_train          -- fit and persist a model on the train split.
      embedding_train      -- fit a new embedding (True) vs reuse a stored
                              one (False) when building X_train.
      predict              -- score the test split and print accuracy.
      evaluate             -- print the per-class, per-gender TPR report.

    Remaining arguments select the artifact keys in DATASET_NAMES.
    """
    print("\nprocessing train.main ...")
    data = pd.DataFrame(load_data(class_group=class_group, from_saved=load_data_from_saved))
    train_set, test_set = data_selection(data, class_group, sampling, test_size, masking)
    # training
    if model_train:
        if embedding_train:
            X_train = embedding_fit_transform(train_set[MASKED[masking]], embedding, class_group, sampling, test_size, masking)
        else:
            X_train = embedding_transform(train_set[MASKED[masking]], embedding, class_group, sampling, test_size, masking)
        Y_train = train_set[TITLE]
        model_training(X_train, Y_train, model, embedding, class_group, sampling, test_size, masking)
    # prediction
    if predict:
        X_test = embedding_transform(test_set[MASKED[masking]], embedding, class_group, sampling, test_size, masking)
        Y_test = test_set[TITLE]
        pred, acc = model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking)
        print("\nModel accuracy:", acc)
    # evaluation
    if evaluate:
        res = tpr_gender_gap(test_set, model, embedding, class_group, sampling, test_size, masking)
        for item in res:
            print(item)
'''
......@@ -30,5 +43,18 @@ embedding -> cv: count_vectorize(self-trained), w2v: word2vec_embedding(pre-trai
model -> svm, rf, nn
'''
if __name__ == "__main__":
    import time

    # Time one full end-to-end run on the small 'trial' class group with
    # count-vectorizer features and an SVM classifier.
    start_time = time.time()
    main(load_data_from_saved = True,
         model_train = True,
         embedding_train = True,
         predict = True,
         evaluate = True,
         class_group = 'trial',
         sampling = 'random',
         embedding = 'cv',
         model = 'svm',
         test_size = 0.2,
         masking = True)
    print("--- %s seconds ---" % (time.time() - start_time))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment