Sparsh Jauhari / Bias in Bio Lab CSSH / Commits

Commit 456db499
Authored Jun 26, 2021 by Nishtha Jain
svm-w2v and self_w2v added

Parent: 1d39aba7
Changes: 5 files
config.py

@@ -25,32 +25,52 @@ MASKED = { True:'bio',
 }

 DATASET_NAMES = {
+    # Naming convention : [dset|embd|modl]_[sv|rf|nn]_[cv|wv]_[tri|med]_[ran|bal]_[test_split]_[b|r]
     # trial domain
-    ('datasets', 'trial') : 'datasets/trial_dataset',
-    ('datasets', 'trial', 'random', 0.2) : 'datasets/trial_set_1',
+    ('datasets', 'trial') : 'datasets/dset___tri___',
+    ('datasets', 'trial', 'random', 0.2) : 'datasets/dset___tri_ran_0.2_',
+    ('datasets', 'trial', 'balanced', 0.2) : 'datasets/dset___tri_bal_0.2_',
+    ('embedding', 'cv', 'trial', 'random', 0.2, 'bio') : 'word_embeddings/embd__cv_tri_ran_0.2_b',
+    ('embedding', 'cv', 'trial', 'random', 0.2, 'raw') : 'word_embeddings/embd__cv_tri_ran_0.2_r',
+    ('embedding', 'w2v', 'trial', 'random', 0.2, 'bio') : 'word_embeddings/embd__wv_tri_ran_0.2_b',
+    ('embedding', 'w2v', 'trial', 'random', 0.2, 'raw') : 'word_embeddings/embd__wv_tri_ran_0.2_r',
+    ('embedding', 'cv', 'trial', 'balanced', 0.2, 'bio') : 'word_embeddings/embd__cv_tri_bal_0.2_b',
+    ('embedding', 'w2v', 'trial', 'balanced', 0.2, 'bio') : 'word_embeddings/embd__wv_tri_bal_0.2_b',
+    ('embedding', 'self_w2v', 'trial', 'balanced', 0.2, 'bio') : 'word_embeddings/embd__tv_tri_bal_0.2_b',
-    ('embedding', 'cv', 'trial', 'random', 0.2, 'bio') : 'word_embeddings/trial_embd_1',
-    ('embedding', 'cv', 'trial', 'random', 0.2, 'raw') : 'word_embeddings/trial_embd_2',
-    ('embedding', 'w2v', 'trial', 'random', 0.2, 'bio') : 'word_embeddings/trial_embd_3',
-    ('embedding', 'w2v', 'trial', 'random', 0.2, 'raw') : 'word_embeddings/trial_embd_4',
+    ('model', 'svm', 'cv', 'trial', 'random', 0.2, 'bio') : 'models/modl_sv_cv_tri_ran_0.2_b',
+    ('model', 'svm', 'cv', 'trial', 'random', 0.2, 'raw') : 'models/modl_sv_cv_tri_ran_0.2_r',
+    ('model', 'svm', 'w2v', 'trial', 'random', 0.2, 'bio') : 'models/modl_sv_wv_tri_ran_0.2_b',
+    ('model', 'svm', 'w2v', 'trial', 'random', 0.2, 'raw') : 'models/modl_sv_wv_tri_ran_0.2_r',
+    ('model', 'svm', 'cv', 'trial', 'balanced', 0.2, 'bio') : 'models/modl_sv_cv_tri_bal_0.2_b',
+    ('model', 'svm', 'w2v', 'trial', 'balanced', 0.2, 'bio') : 'models/modl_sv_wv_tri_bal_0.2_b',
+    ('model', 'svm', 'self_w2v', 'trial', 'balanced', 0.2, 'bio') : 'models/modl_sv_tv_tri_bal_0.2_b',
-    ('model', 'svm', 'cv', 'trial', 'random', 0.2, 'bio') : 'models/trial_model_1',
-    ('model', 'svm', 'cv', 'trial', 'random', 0.2, 'raw') : 'models/trial_model_2',
-    ('model', 'svm', 'w2v', 'trial', 'random', 0.2, 'bio') : 'models/trial_model_3',
-    ('model', 'svm', 'w2v', 'trial', 'random', 0.2, 'raw') : 'models/trial_model_4',

     # medical domain
-    ('datasets', 'medical') : 'datasets/medical_dataset',
-    ('datasets', 'medical', 'random', 0.2) : 'datasets/set_1',
-    ('embedding', 'cv', 'medical', 'random', 0.2, 'bio') : 'word_embeddings/embd_1',
-    ('embedding', 'cv', 'medical', 'random', 0.2, 'raw') : 'word_embeddings/embd_2',
-    ('embedding', 'w2v', 'medical', 'random', 0.2, 'bio') : 'word_embeddings/embd_3',
-    ('embedding', 'w2v', 'medical', 'random', 0.2, 'raw') : 'word_embeddings/embd_4',
-    ('model', 'svm', 'cv', 'medical', 'random', 0.2, 'bio') : 'models/model_1',
-    ('model', 'svm', 'cv', 'medical', 'random', 0.2, 'raw') : 'models/model_2',
-    ('model', 'svm', 'w2v', 'medical', 'random', 0.2, 'bio') : 'models/model_3',
-    ('model', 'svm', 'w2v', 'medical', 'random', 0.2, 'raw') : 'models/model_4'
+    ('datasets', 'medical') : 'datasets/dset___med___',
+    ('datasets', 'medical', 'random', 0.2) : 'datasets/dset___med_ran_0.2_',
+    ('datasets', 'medical', 'balanced', 0.2) : 'datasets/dset___med_bal_0.2_',
+    ('embedding', 'cv', 'medical', 'random', 0.2, 'bio') : 'word_embeddings/embd__cv_med_ran_0.2_b',
+    ('embedding', 'cv', 'medical', 'random', 0.2, 'raw') : 'word_embeddings/embd__cv_med_ran_0.2_r',
+    ('embedding', 'w2v', 'medical', 'random', 0.2, 'bio') : 'word_embeddings/embd__wv_med_ran_0.2_b',
+    ('embedding', 'w2v', 'medical', 'random', 0.2, 'raw') : 'word_embeddings/embd__wv_med_ran_0.2_r',
+    ('embedding', 'cv', 'medical', 'balanced', 0.2, 'bio') : 'word_embeddings/embd__cv_med_bal_0.2_b',
+    ('embedding', 'cv', 'medical', 'balanced', 0.2, 'raw') : 'word_embeddings/embd__cv_med_bal_0.2_r',
+    ('embedding', 'w2v', 'medical', 'balanced', 0.2, 'bio') : 'word_embeddings/embd__wv_med_bal_0.2_b',
+    ('embedding', 'w2v', 'medical', 'balanced', 0.2, 'raw') : 'word_embeddings/embd__wv_med_bal_0.2_r',
+    ('model', 'svm', 'cv', 'medical', 'random', 0.2, 'bio') : 'models/modl_sv_cv_med_ran_0.2_b',
+    ('model', 'svm', 'cv', 'medical', 'random', 0.2, 'raw') : 'models/modl_sv_cv_med_ran_0.2_r',
+    ('model', 'svm', 'w2v', 'medical', 'random', 0.2, 'bio') : 'models/modl_sv_wv_med_ran_0.2_b',
+    ('model', 'svm', 'w2v', 'medical', 'random', 0.2, 'raw') : 'models/modl_sv_wv_med_ran_0.2_r',
+    ('model', 'svm', 'cv', 'medical', 'balanced', 0.2, 'bio') : 'models/modl_sv_cv_med_bal_0.2_b',
+    ('model', 'svm', 'cv', 'medical', 'balanced', 0.2, 'raw') : 'models/modl_sv_cv_med_bal_0.2_r',
+    ('model', 'svm', 'w2v', 'medical', 'balanced', 0.2, 'bio') : 'models/modl_sv_wv_med_bal_0.2_b',
+    ('model', 'svm', 'w2v', 'medical', 'balanced', 0.2, 'raw') : 'models/modl_sv_wv_med_bal_0.2_r'
 }
\ No newline at end of file
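
The keys of DATASET_NAMES are tuples that spell out a full pipeline configuration (artifact type, model, embedding, domain, sampling, test split, masking label), and the values are the file stems under which artifacts are saved. A minimal usage sketch, assuming config.py is on the import path; the printed path follows directly from the self_w2v entry added in this commit:

    from config import DATASET_NAMES

    # Look up the save path for the new self-trained-w2v SVM artifact.
    key = ('model', 'svm', 'self_w2v', 'trial', 'balanced', 0.2, 'bio')
    print(DATASET_NAMES[key] + '.joblib')  # models/modl_sv_tv_tri_bal_0.2_b.joblib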
model.py

 from sklearn.svm import SVC
-from config import DATASET_NAMES, MASKED
+from config import DATASET_NAMES, MASKED, SEED
 from joblib import dump, load
 import numpy as np

 def model_training(X_train, Y_train, model, embedding, class_group, sampling, test_size, masking):
     print("processing model.model_training ...")
     if model == 'svm':
-        trained_model = svm_train(X_train, Y_train)
+        trained_model = svm_train(X_train, Y_train, sampling)
     elif model == 'rf':
         trained_model = rf_train(X_train, Y_train)
     elif model == 'nn':
         trained_model = nn_train(X_train, Y_train)
     dump(trained_model, DATASET_NAMES['model', model, embedding, class_group, sampling, test_size, MASKED[masking]] + '.joblib')
     print("\tsaving file :", DATASET_NAMES['model', model, embedding, class_group, sampling, test_size, MASKED[masking]])

 def model_prediction(X_test, Y_test, model, embedding, class_group, sampling, test_size, masking):

@@ -34,10 +35,16 @@ def load_model(model, embedding, class_group, sampling, test_size, masking):
-def svm_train(X_train, Y_train):
-    classifier = SVC(C=1, kernel='linear', gamma='auto')
+def svm_train(X_train, Y_train, sampling):
+    if sampling == 'balanced':
+        class_weight = 'balanced'
+    elif sampling == 'random':
+        class_weight = None
+    classifier = SVC(C=1, kernel='linear', gamma='auto', class_weight=class_weight, random_state=SEED)
     print("shape of X_train", X_train.shape)
-    print("first instance of X_train", X_train[0])
+    print("first instance of X_train", type(X_train[0]), X_train[0])
     classifier.fit(X_train, Y_train)
     return classifier
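
The new sampling argument switches SVC's class_weight: 'balanced' reweights each class inversely to its frequency, while 'random' leaves classes unweighted (any other sampling value would leave class_weight unbound and raise a NameError). An illustrative sketch of what scikit-learn computes for 'balanced' weights; the class labels here are made up for the example, not part of the commit:

    import numpy as np
    from sklearn.utils.class_weight import compute_class_weight

    # n_samples / (n_classes * count): rarer classes get larger weights.
    y = np.array(['nurse'] * 90 + ['surgeon'] * 10)
    weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
    print(dict(zip(np.unique(y), weights)))  # {'nurse': 0.555..., 'surgeon': 5.0}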
preprocessing.py

@@ -5,7 +5,7 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.feature_extraction.text import CountVectorizer
 from config import WORD2VEC_PATH, DATASET_NAMES, MASKED
 # from gensim.models import Word2Vec
-from gensim.models import KeyedVectors
+from gensim.models import KeyedVectors, Word2Vec
 from joblib import dump, load
 import numpy as np

@@ -19,9 +19,13 @@ def embedding_fit_transform(x_list, embedding, class_group, sampling, test_size,
     if embedding == 'cv':
         trained_embedding, X = count_vectorize_fit_transform(x_list)
     elif embedding == 'w2v':
-        trained_embedding, X = word2vec_fit_transform(x_list)
+        trained_embedding, X = pretrained_word2vec_fit_transform(x_list)
+    elif embedding == 'self_w2v':
+        trained_embedding, X = selftrained_word2vec_fit_transform(x_list)
     dump(trained_embedding, DATASET_NAMES['embedding', embedding, class_group, sampling, test_size, MASKED[masking]] + '.joblib')
     print("\tsaving file :", DATASET_NAMES['embedding', embedding, class_group, sampling, test_size, MASKED[masking]])
     return(X)

@@ -32,7 +36,7 @@ def embedding_transform(x_list, embedding, class_group, sampling, test_size, mas
     if embedding == 'cv':
         X = count_vectorize_transform(trained_embedding, x_list)
-    elif embedding == 'w2v':
+    elif embedding == 'w2v' or embedding == 'self_w2v':
         X = word2vec_transform(trained_embedding, x_list)
     return(X)

@@ -58,7 +62,7 @@ def preProcessAndTokenize(sentence):
 def count_vectorize_fit_transform(x_list):
     vectorizer_binary = CountVectorizer(lowercase=True, preprocessor=None, binary=True, stop_words=None, tokenizer=preProcessAndTokenize)
     X = vectorizer_binary.fit_transform(x_list)
-    # print("count vectorized with dimension : ", len(vectorizer_binary.get_feature_names()))
+    print("count vectorized with dimension : ", len(vectorizer_binary.get_feature_names()))
     return vectorizer_binary, X

@@ -67,20 +71,56 @@ def count_vectorize_transform(vectorizer_binary,x_list):
     return X

-def word2vec_fit_transform(x_list):
-    ## To-Do
+class MeanEmbeddingVectorizer(object):
+    def __init__(self, word2vec):
+        self.word2vec = word2vec
+        # if a text is empty we should return a vector of zeros
+        # with the same dimensionality as all the other vectors
+        self.dim = len(word2vec.itervalues().next())
+
+    def fit(self, X, y):
+        return self
+
+    def transform(self, X):
+        return np.array([
+            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
+                    or [np.zeros(self.dim)], axis=0)
+            for words in X
+        ])
+
+def word2vec_transform(model, x_list):
+    X = np.array([np.mean([model[word] for word in filter(lambda x: x in model, preProcessAndTokenize(sent))], axis=0) for sent in x_list])
+    return X
+
+def pretrained_word2vec_fit_transform(x_list):
     # pre trained
     # check again
     # model = Word2Vec.load(WORD2VEC_PATH)
     model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)
-    X = np.array([[model[word] for word in filter(lambda x: x in model, preProcessAndTokenize(sent))] for sent in x_list])
+    X = word2vec_transform(model, x_list)
     return model, X
+
+def selftrained_word2vec_fit(corpus):
+    tokensed_corpus = [preProcessAndTokenize(sent) for sent in corpus]
+    model = Word2Vec(tokensed_corpus, vector_size=100).wv
+    return model
+
+def selftrained_word2vec_fit_transform(x_list):
+    ## To-Do
+    # change corpus to all possible texts
+    corpus = x_list
+    model = selftrained_word2vec_fit(corpus)
+    X = word2vec_transform(model, x_list)
+    return model, X
-
-def word2vec_transform(model, x_list):
-    X = np.array([[model[word] for word in filter(lambda x: x in model, preProcessAndTokenize(sent))] for sent in x_list])
-    return X
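
The new word2vec_transform averages the vectors of in-vocabulary tokens, so a sentence with no token in the vocabulary makes np.mean run over an empty list and yield NaN; MeanEmbeddingVectorizer guards that case with a zero vector but uses the Python 2 idiom word2vec.itervalues().next(). A hedged Python 3 sketch of the same mean-pooling with the empty-sentence guard; the helper name and tokenize parameter are illustrative, not part of the commit:

    import numpy as np

    def mean_pool_transform(model, x_list, tokenize):
        # `model` is assumed to be a gensim KeyedVectors instance.
        dim = model.vector_size
        rows = []
        for sent in x_list:
            vectors = [model[word] for word in tokenize(sent) if word in model]
            # Fall back to a zero vector when no token is in the vocabulary.
            rows.append(np.mean(vectors, axis=0) if vectors else np.zeros(dim))
        return np.array(rows)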
sampling.py

@@ -13,6 +13,7 @@ def get_data_from_mongo(class_group):
     data = list(collection.find({'$or': [{'title': title} for title in CLASS_GROUP[class_group]]}))
     dump(data, DATASET_NAMES['datasets', class_group] + '.joblib')
+    print("\tsaving file : ", DATASET_NAMES['datasets', class_group])
     return(data)

@@ -26,22 +27,21 @@ def load_data(class_group, from_saved=True):
     return data

 '''
-sampling -> random, upsample, downsample, weighted
+sampling -> random, balanced(upsample, downsample, weighted)
 '''
 def data_selection(data, class_group, sampling, test_size, masking=True):
     print('processing sampling.data_selection ...')
     if sampling == 'random':
         train, test = train_test_split(data, test_size=test_size, random_state=SEED)
+    # same for now
+    elif sampling == 'balanced':
+        train, test = train_test_split(data, test_size=test_size, random_state=SEED)
-    elif sampling == 'downsample':
-        # TO-DO:
-        train, test = None, None
     else:
         # TO-DO:
         train, test = None, None
     dump([train, test], DATASET_NAMES['datasets', class_group, sampling, test_size] + '.joblib')
     print("\tsaving file : ", DATASET_NAMES['datasets', class_group, sampling, test_size])
     return train, test
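
The 'balanced' branch is currently identical to 'random' ("same for now"); class-level balancing happens downstream in svm_train via class_weight instead. A hedged sketch of one way the remaining downsampling TO-DO could later be filled in; the pandas DataFrame input and the 'title' label column are assumptions, not part of the commit:

    # Downsample every class to the size of the rarest class.
    def downsample(data, label_col='title', random_state=None):
        smallest = data[label_col].value_counts().min()
        return (data.groupby(label_col, group_keys=False)
                    .apply(lambda g: g.sample(n=smallest, random_state=random_state)))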
train.py

@@ -6,7 +6,7 @@ from evaluation import tpr_gender_gap
 import pandas as pd

-def main(load_data_from_saved, model_train, embedding_train, predict, evaluate, class_group, sampling, embedding, model, test_size, masking):
+def main(load_data_from_saved, embedding_train, model_train, predict, evaluate, class_group, sampling, embedding, model, test_size, masking):
     print("\nprocessing train.main ...")
     data = pd.DataFrame(load_data(class_group=class_group, from_saved=load_data_from_saved))

@@ -38,8 +38,8 @@ def main(load_data_from_saved, model_train, embedding_train, predict, evaluate,
 '''
-sampling -> random, upsample, downsample, weighted
-embedding -> cv: count_vectorize(self-trained), w2v: word2vec_embedding(pre-trained)
+sampling -> random, balanced(upsample, downsample, weighted)
+embedding -> cv: count_vectorize(self-trained), w2v: word2vec_embedding(pre-trained), self_w2v: w2v(self-trained)
 model -> svm, rf, nn
 '''
 if __name__ == "__main__":

@@ -47,14 +47,14 @@ if __name__ == "__main__":
     start_time = time.time()
     main(load_data_from_saved=True,
-         model_train=True,
          embedding_train=True,
+         model_train=True,
          predict=True,
-         evaluate=True,
+         evaluate=False,
          class_group='trial',
-         sampling='random',
-         embedding='cv',
+         sampling='balanced',
+         embedding='self_w2v',
          model='svm',
          test_size=0.2,
          masking=True)
-    print("--- %s seconds ---" % (time.time() - start_time))
+    print("\n--- %s seconds ---" % (time.time() - start_time))
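
With this change the driver runs the self-trained word2vec path with balanced sampling and evaluation switched off. A hedged usage sketch of calling the same entry point for the pre-trained w2v path on the medical domain instead; every key this combination implies has a corresponding entry in the updated DATASET_NAMES:

    main(load_data_from_saved=True,
         embedding_train=True,
         model_train=True,
         predict=True,
         evaluate=False,
         class_group='medical',
         sampling='balanced',
         embedding='w2v',
         model='svm',
         test_size=0.2,
         masking=True)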