Commit 0399f688 authored by Nishtha Jain's avatar Nishtha Jain
Browse files

ipnb cleaned

parent 65a259ba
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# pip installations
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#!pip install pymongo #!pip install pymongo
#!pip install dnspython==2.0.0 #!pip install dnspython==2.0.0
!pip install gensim !pip install gensim
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Config
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Occupation titles used to filter the bios collection further down.
CLASSES = [
    'physician',
    'nurse',
    'psychologist',
    'dentist',
    'surgeon',
    'dietitian',
    'chiropractor',
]
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## BIOS.pkl pickle insight
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import pickle

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load a BIOS.pkl that comes from a trusted source.
with open("datasets/BIOS.pkl", "rb") as bios_file:
    data = pickle.load(bios_file)

# Distinct values of the categorical fields. The summary prints below read
# these sets, so they have to be built before printing.
title = {record['title'] for record in data}
raw_title = {record['raw_title'] for record in data}
gender = {record['gender'] for record in data}

print("number data points: ", len(data))
print("structure of a data point [0] : \n", data[0])
print("types of gender:", gender)
print("types of title:", len(title))
print(title)
print("types of raw_title:", len(raw_title))
print(raw_title)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Mongo Connection using pymongo
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import os

import pymongo

# NOTE(security): the connection string (user name + password) must never be
# committed with the notebook. It is read from the MONGODB_URI environment
# variable instead; the credential that was previously embedded here should
# be rotated because it is part of the repository history.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this cell."
    )

client = pymongo.MongoClient(mongo_uri)
# Collection holding every scraped biography document.
collection = client['biodb']['allbio']
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Query the distinct titles once and reuse the result for both prints,
# instead of sending the same query to the server twice.
distinct_titles = collection.distinct("title")
print(len(distinct_titles))
print(distinct_titles)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from pandas import DataFrame from pandas import DataFrame
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Fetch all teacher/professor bios. `$in` is the idiomatic form of an `$or`
# made of equality checks on a single field.
df = DataFrame(collection.find({'title': {'$in': ['teacher', 'professor']}}))

# (printed label, title value, gender value) — printed in this order.
for label, job, gender_code in (
    ("Teacher male", 'teacher', 'M'),
    ("Professor male", 'professor', 'M'),
    ("Teacher female", 'teacher', 'F'),
    ("Professor female", 'professor', 'F'),
):
    matches = df.loc[(df['title'] == job) & (df['gender'] == gender_code)]
    print(label, len(matches))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Fetch all surgeon/nurse bios. `$in` is the idiomatic form of an `$or`
# made of equality checks on a single field.
df = DataFrame(collection.find({'title': {'$in': ['surgeon', 'nurse']}}))

# (printed label, title value, gender value) — printed in this order.
for label, job, gender_code in (
    ("surgeon male", 'surgeon', 'M'),
    ("nurse male", 'nurse', 'M'),
    ("surgeon female", 'surgeon', 'F'),
    ("nurse female", 'nurse', 'F'),
):
    matches = df.loc[(df['title'] == job) & (df['gender'] == gender_code)]
    print(label, len(matches))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Load every bio whose title is one of the configured CLASSES.
df = DataFrame(collection.find({'title': {'$in': list(CLASSES)}}))
# Bare expression: the notebook renders the frame as the cell output.
df
``` ```
%%%% Output: execute_result %%%% Output: execute_result
_id \ _id \
0 607b0639c53ee5775fe7758b 0 607b0639c53ee5775fe7758b
1 607b0639c53ee5775fe77590 1 607b0639c53ee5775fe77590
2 607b0639c53ee5775fe7759b 2 607b0639c53ee5775fe7759b
3 607b063ac53ee5775fe775b0 3 607b063ac53ee5775fe775b0
4 607b063ac53ee5775fe775b3 4 607b063ac53ee5775fe775b3
... ... ... ...
112458 607b1d5dc53ee5775fed7952 112458 607b1d5dc53ee5775fed7952
112459 607b1d5dc53ee5775fed795a 112459 607b1d5dc53ee5775fed795a
112460 607b1d5dc53ee5775fed795b 112460 607b1d5dc53ee5775fed795b
112461 607b1d5dc53ee5775fed7964 112461 607b1d5dc53ee5775fed7964
112462 607b1d5dc53ee5775fed7967 112462 607b1d5dc53ee5775fed7967
path \ path \
0 crawl-data/CC-MAIN-2016-44/segments/1476988720... 0 crawl-data/CC-MAIN-2016-44/segments/1476988720...
1 crawl-data/CC-MAIN-2014-41/segments/1410657132... 1 crawl-data/CC-MAIN-2014-41/segments/1410657132...
2 crawl-data/CC-MAIN-2013-20/segments/1368702127... 2 crawl-data/CC-MAIN-2013-20/segments/1368702127...
3 crawl-data/CC-MAIN-2014-41/segments/1410657120... 3 crawl-data/CC-MAIN-2014-41/segments/1410657120...
4 crawl-data/CC-MAIN-2013-20/segments/1368696381... 4 crawl-data/CC-MAIN-2013-20/segments/1368696381...
... ... ... ...
112458 crawl-data/CC-MAIN-2018-43/segments/1539583519... 112458 crawl-data/CC-MAIN-2018-43/segments/1539583519...
112459 crawl-data/CC-MAIN-2018-43/segments/1539583519... 112459 crawl-data/CC-MAIN-2018-43/segments/1539583519...
112460 crawl-data/CC-MAIN-2018-43/segments/1539583519... 112460 crawl-data/CC-MAIN-2018-43/segments/1539583519...
112461 crawl-data/CC-MAIN-2018-43/segments/1539583519... 112461 crawl-data/CC-MAIN-2018-43/segments/1539583519...
112462 crawl-data/CC-MAIN-2018-43/segments/1539583519... 112462 crawl-data/CC-MAIN-2018-43/segments/1539583519...
raw \ raw \
0 Edmund J. Bourne, PhD, is a psychologist in no... 0 Edmund J. Bourne, PhD, is a psychologist in no...
1 Abigail Mackey is a registered nurse. For more... 1 Abigail Mackey is a registered nurse. For more...
2 Dr. Constance Milbrath is a developmental psyc... 2 Dr. Constance Milbrath is a developmental psyc...
3 Dr. Andrew Gottlieb is a clinical psychologist... 3 Dr. Andrew Gottlieb is a clinical psychologist...
4 Milton Wolf is a physician practicing in Kansa... 4 Milton Wolf is a physician practicing in Kansa...
... ... ... ...
112458 Adrienne Lewis Adrienne is a registered nurse ... 112458 Adrienne Lewis Adrienne is a registered nurse ...
112459 Eric Haralson, PA-C is a physician assistant i... 112459 Eric Haralson, PA-C is a physician assistant i...
112460 Alice Sumo is a respected nurse in Liberia, wh... 112460 Alice Sumo is a respected nurse in Liberia, wh...
112461 Rachel Kelley Schulman, MS, PA-C is a board-ce... 112461 Rachel Kelley Schulman, MS, PA-C is a board-ce...
112462 Victor N. Hakim, MD is a practicing Orthopedic... 112462 Victor N. Hakim, MD is a practicing Orthopedic...
name raw_title gender start_pos \ name raw_title gender start_pos \
0 [Edmund, J, Bourne] psychologist M 136 0 [Edmund, J, Bourne] psychologist M 136
1 [Abigail, , Mackey] nurse F 37 1 [Abigail, , Mackey] nurse F 37
2 [Constance, , Milbrath] psychologist F 305 2 [Constance, , Milbrath] psychologist F 305
3 [Andrew, , Gottlieb] psychologist M 72 3 [Andrew, , Gottlieb] psychologist M 72
4 [Milton, , Wolf] physician M 107 4 [Milton, , Wolf] physician M 107
... ... ... ... ... ... ... ... ... ...
112458 [Adrienne, Lewis, Adrienne] nurse F 118 112458 [Adrienne, Lewis, Adrienne] nurse F 118
112459 [Eric, , Haralson] physician M 98 112459 [Eric, , Haralson] physician M 98
112460 [Alice, , Sumo] nurse F 98 112460 [Alice, , Sumo] nurse F 98
112461 [Rachel, Kelley, Schulman] physician F 74 112461 [Rachel, Kelley, Schulman] physician F 74
112462 [Victor, N, Hakim] Orthopedic Surgeon M 72 112462 [Victor, N, Hakim] Orthopedic Surgeon M 72
title URI \ title URI \
0 psychologist http://www.alibris.co.uk/search/books/author/E... 0 psychologist http://www.alibris.co.uk/search/books/author/E...
1 nurse http://observer-reporter.com/article/20130315/... 1 nurse http://observer-reporter.com/article/20130315/...
2 psychologist http://earlylearning.ubc.ca/people/ 2 psychologist http://earlylearning.ubc.ca/people/
3 psychologist http://www.psychologylounge.com/tag/sexuality-2/ 3 psychologist http://www.psychologylounge.com/tag/sexuality-2/
4 physician http://hotair.com/archives/2011/12/13/romney-i... 4 physician http://hotair.com/archives/2011/12/13/romney-i...
... ... ... ... ... ...
112458 nurse https://www.adansw.com.au/CPD/Courses/Geriatri... 112458 nurse https://www.adansw.com.au/CPD/Courses/Geriatri...
112459 physician https://www.healthgrades.com/providers/eric-ha... 112459 physician https://www.healthgrades.com/providers/eric-ha...
112460 nurse http://woman.ng/2018/10/women-love-midwife-ali... 112460 nurse http://woman.ng/2018/10/women-love-midwife-ali...
112461 physician https://lincolnparkaesthetics.com/ourstaff/ 112461 physician https://lincolnparkaesthetics.com/ourstaff/
112462 surgeon https://www.sharecare.com/doctor/dr-victor-n-h... 112462 surgeon https://www.sharecare.com/doctor/dr-victor-n-h...
bio bio
0 _ is author of several books, including the be... 0 _ is author of several books, including the be...
1 For more quips and tips, refer to _ blog, “The... 1 For more quips and tips, refer to _ blog, “The...
2 _ interests at HELP are in the ethno-cultural ... 2 _ interests at HELP are in the ethno-cultural ...
3 _ practice serves the greater Silicon Valley a... 3 _ practice serves the greater Silicon Valley a...
4 During the health care debates of 2010, Dr. _ ... 4 During the health care debates of 2010, Dr. _ ...
... ... ... ...
112458 _ has been successful in gaining two nationall... 112458 _ has been successful in gaining two nationall...
112459 _ graduated from Touro Center / College Of Ost... 112459 _ graduated from Touro Center / College Of Ost...
112460 In _ three-decade career, _ has seen two civil... 112460 In _ three-decade career, _ has seen two civil...
112461 _ has a Master of Science degree in Physician ... 112461 _ has a Master of Science degree in Physician ...
112462 _ completed a residency at Henry Ford Hospital... 112462 _ completed a residency at Henry Ford Hospital...
[112463 rows x 10 columns] [112463 rows x 10 columns]
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## TPR Graph mid ppt
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Per-class figures for the plots below. All five lists are parallel:
# position i of each list refers to classes[i].
classes = ['physician', 'nurse', 'psychologist', 'dentist', 'surgeon', 'dietitian', 'chiropractor']
tpr_males = [0.864, 0.8128654970760234, 0.8922631959508315, 0.949435180204411, 0.6972111553784861, 0.6851851851851852, 0.701058201058201]
tpr_females = [0.9056320400500626, 0.8622613803230543, 0.8844028899277518, 0.9446064139941691, 0.6285714285714286, 0.8905608755129959, 0.6625]
count_males = [4125, 342, 1383, 1859, 2259, 54, 378]
count_females = [3995, 3405, 2353, 1029, 420, 731, 160]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import matplotlib.pyplot as plt

# x: share of male bios in each class; y: male TPR minus female TPR.
# NOTE(review): x is a fraction in [0, 1] although the axis label says "%".
x = [males / (males + females) for males, females in zip(count_males, count_females)]
y = [tpr_m - tpr_f for tpr_m, tpr_f in zip(tpr_males, tpr_females)]

plt.scatter(x, y)
plt.xlabel("% Male")
plt.ylabel("TPR Gender Gap Male")
# Label every point with the class it belongs to.
for txt, point_x, point_y in zip(classes, x, y):
    plt.annotate(txt, (point_x, point_y))
plt.show()
``` ```
%%%% Output: display_data %%%% Output: display_data
![]() ![]()
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# x: share of female bios in each class; y: female TPR minus male TPR.
# NOTE(review): x is a fraction in [0, 1] although the axis label says "%".
x = [females / (males + females) for males, females in zip(count_males, count_females)]
y = [tpr_f - tpr_m for tpr_m, tpr_f in zip(tpr_males, tpr_females)]

plt.scatter(x, y)
plt.xlabel("% Female")
plt.ylabel("TPR Gender Gap Female")
# Label every point with the class it belongs to.
for txt, point_x, point_y in zip(classes, x, y):
    plt.annotate(txt, (point_x, point_y))
plt.show()
``` ```
%%%% Output: display_data %%%% Output: display_data
![]() ![]()
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import numpy as np

# Grouped bar chart: one male and one female bar per class, placed side by
# side around the integer tick of that class.
positions = np.arange(len(classes))
bar_width = 0.3

plt.figure(figsize=(10, 5))
plt.bar(positions - bar_width / 2, count_males, width=bar_width, label='males')
plt.bar(positions + bar_width / 2, count_females, width=bar_width, label='females')
# Put the class names on the ticks instead of the numeric positions.
plt.xticks(positions, classes)
plt.legend()
plt.show()
``` ```
%%%% Output: display_data %%%% Output: display_data
![]() ![]()
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Evaluations Final PPT
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from joblib import load from joblib import load
from config import CLASS_GROUP, MASKED, EVALUATION_SCORES from config import CLASS_GROUP, MASKED, EVALUATION_SCORES
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
def tpr_gender_gap(scores, class_group='medical'):
    """Compute per-class gender share and true-positive-rate gap.

    One value is produced for each of the first
    ``len(CLASS_GROUP[class_group])`` positions of the score lists.

    Args:
        scores: Mapping with the index-aligned sequences ``'count_males'``,
            ``'count_females'``, ``'tpr_males'`` and ``'tpr_females'``.
            Presumably ordered like ``CLASS_GROUP[class_group]`` — confirm
            against the code that produces the scores.
        class_group: Key into ``CLASS_GROUP``; only its length is used.

    Returns:
        Tuple ``(x_males, y_males, x_females, y_females)`` of lists where
        ``x_*`` is that gender's fraction of the class count and ``y_*`` is
        that gender's TPR minus the other gender's TPR.

    Raises:
        IndexError: If a score sequence is shorter than the class group.
        ZeroDivisionError: If a class has a total count of zero (for plain
            Python numbers).
    """
    n_classes = len(CLASS_GROUP[class_group])
    x_males, y_males, x_females, y_females = [], [], [], []
    for i in range(n_classes):
        males = scores['count_males'][i]
        females = scores['count_females'][i]
        tpr_m = scores['tpr_males'][i]
        tpr_f = scores['tpr_females'][i]
        total = males + females

        x_males.append(males / total)
        y_males.append(tpr_m - tpr_f)
        x_females.append(females / total)
        # Computed explicitly (not as a negation) so a zero gap stays +0.0.
        y_females.append(tpr_f - tpr_m)
    return (x_males, y_males, x_females, y_females)
def average_odds_difference(scores, class_group='medical'):
    """Compute per-class gender share and average odds difference.

    The average odds difference of a gender is the mean of its FPR gap and
    its TPR gap relative to the other gender. One value is produced for each
    of the first ``len(CLASS_GROUP[class_group])`` positions of the score
    lists.

    Args:
        scores: Mapping with the index-aligned sequences ``'count_males'``,
            ``'count_females'``, ``'tpr_males'``, ``'tpr_females'``,
            ``'fpr_males'`` and ``'fpr_females'``. Presumably ordered like
            ``CLASS_GROUP[class_group]`` — confirm against the code that
            produces the scores.
        class_group: Key into ``CLASS_GROUP``; only its length is used.

    Returns:
        Tuple ``(x_males, y_males, x_females, y_females)`` of lists where
        ``x_*`` is that gender's fraction of the class count and ``y_*`` is
        ``((FPR_own - FPR_other) + (TPR_own - TPR_other)) / 2``.

    Raises:
        IndexError: If a score sequence is shorter than the class group.
        ZeroDivisionError: If a class has a total count of zero (for plain
            Python numbers).
    """
    n_classes = len(CLASS_GROUP[class_group])
    x_males, y_males, x_females, y_females = [], [], [], []
    for i in range(n_classes):
        males = scores['count_males'][i]
        females = scores['count_females'][i]
        tpr_m = scores['tpr_males'][i]
        tpr_f = scores['tpr_females'][i]
        fpr_m = scores['fpr_males'][i]
        fpr_f = scores['fpr_females'][i]
        total = males + females

        x_males.append(males / total)
        y_males.append((fpr_m - fpr_f + tpr_m - tpr_f) / 2)
        x_females.append(females / total)
        y_females.append((fpr_f - fpr_m + tpr_f - tpr_m) / 2)
    return (x_males, y_males, x_females, y_females)
def average_odds_error(scores, class_group='medical'): def average_odds_error(scores, class_group='medical'):