Commit a2172afe authored by Sparsh Jauhari 💬
Browse files

Added code to debias word embedding

parent b826cce3
......@@ -17,7 +17,7 @@ CLASSES = ['physician',
]
""" embd__tv____r.joblib is word2vec model trained on our medical domain."""
model = load('embd__tv____r.joblib')
model = load('bias-in-bio-lab-cssh/word_embeddings/embd__tv____r.joblib')
vecs =[]
words = [w for w in model.index_to_key ]
vecs = [model[w] for w in words]
......@@ -29,13 +29,13 @@ def normalize(vecs):
"""Normalizes the vectors."""
vecs /= np.linalg.norm(vecs, axis=1)[:, np.newaxis]
with open('definitional_pairs.json', "r") as f:
with open('bias-in-bio-lab-cssh/word_embeddings/definitional_pairs.json', "r") as f:
"""The ten pairs of words used to define the gender direction.
The file can be found at: https://github.com/tolga-b/debiaswe/tree/10277b23e187ee4bd2b6872b507163ef4198686b/data"""
definitional_pairs = json.load(f)
with open('gender_specific_full.json', "r") as f:
with open('bias-in-bio-lab-cssh/word_embeddings/gender_specific_full.json', "r") as f:
""" A list of 1441 gender-specific words.
The file can be found at: https://github.com/tolga-b/debiaswe/tree/10277b23e187ee4bd2b6872b507163ef4198686b/data"""
gender_specific_words = json.load(f)
......@@ -86,4 +86,4 @@ def save_binary_file(filename, binary=True):
if binary:
fout.write(to_utf8(word) + b" " + row.tobytes())
"""Saves the debiased word embeddings as binary file"""
save_binary_file('Self_trained_word2vec_debiased.bin')
save_binary_file('bias-in-bio-lab-cssh/word_embeddings/Self_trained_word2vec_debiased.bin')
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment