This repository has been archived by the owner on Apr 21, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdecision_maker.py
288 lines (267 loc) · 14.3 KB
/
decision_maker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
"""Copyright © 2020-present, Swisscom (Schweiz) AG.
All rights reserved.
DecisionMaker, used for modeling the interactions of a decision maker in the sourcing process.
The DecisionMaker class contains the implementation of the different interactions.
Its function is to evaluate results with a specific subpopulation, i.e. team.
"""
from itertools import product
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from .person import Person
class DecisionMaker(Person):
    """DecisionMaker class.

    Models a person who takes a binary decision on candidates. The class can group a decision
    history by candidate profile (age, gender, language), compute a fairness value from the
    positive-decision rates observed at each "distance" from the decision maker's own attributes,
    and plot the results.
    """

    def __init__(self, pers_id: str, team_id: str, decision_name: str, attributes: dict,
                 min_candidates_bucket=10):
        """ Inits DecisionMaker with its pers_id, team_id, decision_name, attributes, min_candidates_bucket

        Args:
            pers_id: A string that uniquely identifies this employee
            team_id: A string that uniquely identifies the team the employee belongs to
            decision_name: A string that indicates the binary column containing the decision made by this actor
            attributes: A Python dict that contains the criteria and categories the DecisionMaker belongs to
            min_candidates_bucket: An integer that is the minimum number of candidates a distance group
                must contain to be taken into account by get_fairness
        Raises:
            TypeError: if min_candidates_bucket is not an integer
            ValueError: if min_candidates_bucket is not strictly positive or decision_name is empty
        """
        if not isinstance(min_candidates_bucket, int):
            raise TypeError('Argument: the min_candidates_bucket must be an integer')
        if min_candidates_bucket <= 0:
            raise ValueError('Argument: the min_candidates_bucket must be a positive integer')
        if len(decision_name) == 0:
            raise ValueError('Argument: the decision_name has length 0.')
        # NOTE(review): the methods below read self._attributes, presumably set by Person.__init__
        super().__init__(pers_id, team_id, attributes)
        self._decision = decision_name
        self._min_candidates = min_candidates_bucket

    @staticmethod
    def _validate_history(data) -> None:
        """ Shared argument check: rejects an empty or non-DataFrame decision history

        Args:
            data: the object received as decision history
        Raises:
            ValueError: if data has length 0
            TypeError: if data is not a Pandas DataFrame
        """
        # The emptiness test comes first to keep the exceptions raised by the public methods unchanged
        if len(data) == 0:
            raise ValueError('Argument : data is empty')
        if not isinstance(data, pd.DataFrame):
            raise TypeError('Argument : data is not a Pandas DataFrame')

    def _compute_landscape(self, data: pd.DataFrame) -> pd.DataFrame:
        """ A function that groups all the candidates and returns the necessary counts for the landscape

        Args:
            data: a Pandas DataFrame with a row per candidate; it must contain the columns
                'age', 'gender', 'language' and the decision column of this DecisionMaker
        Returns:
            A Pandas DataFrame with one row per (age, gender, language) combination of the values
            present in data, and the columns 'age', 'gender', 'language', 'total' (number of candidates),
            'pos_count' (number of decisions equal to 1) and 'probability' (pos_count / total,
            0 for a combination without candidates)
        Raises:
            ValueError: if data is empty or one of the required columns is missing
            TypeError: if data is not a Pandas DataFrame
        """
        self._validate_history(data)
        for column in ('age', 'gender', 'language'):
            if column not in data.columns:
                raise ValueError(f'Argument : data is missing {column} column.')
        if self._decision not in data.columns:
            # A single formatted message: passing several arguments to ValueError renders as a tuple
            raise ValueError(f'Argument : data is missing {self._decision} column.')
        # Get all the combinations of profile possible, including the ones nobody belongs to
        profiles = product(sorted(data['age'].unique()), data['gender'].unique(),
                           data['language'].unique())
        # Collect the metrics of each profile, the DataFrame is built once at the end
        rows = []
        for age, gender, language in profiles:
            # Decisions taken on the candidates of the current profile
            in_profile = (data['age'] == age) & (data['gender'] == gender) & (data['language'] == language)
            decisions = data.loc[in_profile, self._decision]
            total = len(decisions)
            # Positive decisions are the rows equal to 1
            pos_count = int((decisions == 1).sum())
            prob = pos_count / total if total else 0
            rows.append([age, gender, language, float(total), pos_count, prob])
        return pd.DataFrame(rows, columns=['age', 'gender', 'language', 'total', 'pos_count', 'probability'])

    def _distance_candidate(self, cand_attributes: dict) -> int:
        """ Function that returns how many criteria differ between the candidate and the DecisionMaker

        Args:
            cand_attributes: A Python dict with the categories of each criteria that the candidate belongs to
        Returns:
            An integer that represents the number of attributes of the DecisionMaker whose value differs
            from the one of the given candidate
        Raises:
            ValueError: if cand_attributes is empty or its keys do not match the DecisionMaker's attributes
        """
        if len(cand_attributes) == 0:
            raise ValueError('Argument: cand_attributes are empty')
        if len(self._attributes.keys()) != len(cand_attributes.keys()):
            raise ValueError(
                'Argument: cand_attributes does not have the same number of the attributes of Decision Maker')
        if any(attr not in cand_attributes for attr in self._attributes):
            raise ValueError('Argument: cand_attributes does not match the attributes of Decision Maker')
        # One unit of distance per attribute whose value differs
        return sum(1 for attr, value in self._attributes.items() if value != cand_attributes[attr])

    @staticmethod
    def _prule(distance_probs: dict) -> float:
        """ A function that computes the fairness value of a DecisionMaker's past decisions

        Args:
            distance_probs: a dict that given a distance with the recruiter returns the probability of positive decision
        Returns:
            A float that is the ratio of the smallest over the largest probability, rounded to 4 decimals
        Raises:
            ValueError: if distance_probs is empty, has a value outside [0, 1] or has only zeros
        """
        if len(distance_probs) == 0:
            raise ValueError('Argument: distance_probs is empty')
        probabilities = list(distance_probs.values())
        if any(p < 0 for p in probabilities):
            raise ValueError('Argument: distance_probs has negative values')
        if any(p > 1 for p in probabilities):
            raise ValueError('Argument: distance_probs has values larger than 1')
        max_p = max(probabilities)
        # The ratio is undefined when no group received a positive decision
        if max_p == 0:
            raise ValueError('Argument: distance_probs has max probability 0')
        # Compute the ratio of min over max
        return np.round(min(probabilities) / max_p, 4)

    def get_fairness(self, data: pd.DataFrame) -> (float, dict):
        """ A function that returns the Fairness value of this DecisionMaker given their history and their probability
        of positive decisions given the distance to the candidate.

        Args:
            data: a Pandas DataFrame that has one row per candidate and their decision
        Returns:
            A tuple of a float for the fairness value and a dict for the probabilities given the distances;
            the dict only holds the distances that gather at least min_candidates_bucket candidates
        Raises:
            ValueError: if data is empty or misses a column, or if no distance group is large enough
            TypeError: if data is not a Pandas DataFrame
        """
        self._validate_history(data)
        # Get the grouped data
        landscape = self._compute_landscape(data)
        criteria = list(self._attributes.keys())
        # A candidate can differ on 0 up to len(criteria) attributes
        distances = range(len(criteria) + 1)
        distance_counts = dict.fromkeys(distances, 0)
        distance_positives = dict.fromkeys(distances, 0)
        for _, row in landscape.iterrows():
            # Group the candidates according to their distance to the DecisionMaker
            d = self._distance_candidate(row[criteria].to_dict())
            distance_counts[d] += row['total']
            distance_positives[d] += row['pos_count']
        # Ensure that we consider only the groups that have enough candidates
        distance_probs = {d: distance_positives[d] / distance_counts[d]
                          for d in distances
                          if distance_counts[d] >= self._min_candidates}
        # Compute the p-rule
        return self._prule(distance_probs), distance_probs

    def visualise_fairness(self, data: pd.DataFrame, distance_probs: dict, fig_size=(5, 5), save_fig=None,
                           dark_theme=True):
        """ A function that displays the different probabilities to visualise the fairness level

        The bars show the probability of positive decision per distance, the dashed line shows the
        overall probability of positive decision in data.

        Args:
            data: a Pandas DataFrame with the decision history of this DecisionMaker
            distance_probs: a Python dict with the probabilities of positive decisions given a distance
            fig_size: a tuple indicating the size of the plot, default (5, 5)
            save_fig: a file_name where to save the plot, default None (not saving)
            dark_theme: a boolean, True for a light grey figure background, False for a transparent one,
                default True
        Raises:
            ValueError: if data or distance_probs is empty, or data has no decision equal to 0 or 1
            TypeError: if data is not a Pandas DataFrame
        """
        self._validate_history(data)
        if len(distance_probs) == 0:
            raise ValueError('Argument: distance_probs is empty')
        # Compute the counts of decisions, a missing outcome counts as 0 instead of raising a KeyError
        counts = data[self._decision].value_counts(dropna=False)
        positives = counts.get(1, 0)
        negatives = counts.get(0, 0)
        if positives + negatives == 0:
            raise ValueError(f'Argument : data has no 0/1 decision in {self._decision} column.')
        # Compute the overall probability
        avg_prob = positives / (positives + negatives)
        # The face colour is an rc parameter read when a figure is created, so set it before the figure
        sns.set(rc={'figure.facecolor': 'lightgrey' if dark_theme else 'none'})
        fig = plt.figure(figsize=fig_size)
        # Plot the data
        plt.axhline(y=avg_prob, color='black', ls='--', linewidth=1, alpha=0.9)
        # x and y are passed by keyword, recent seaborn versions do not accept them positionally
        sns.barplot(x=list(distance_probs.keys()), y=list(distance_probs.values()))
        plt.title("Distance distribution for given recruiter")
        plt.ylim((0, 1))
        # Saving figure if needed
        if save_fig is not None:
            fig.tight_layout()
            fig.savefig(save_fig, bbox_inches='tight')
        plt.show()

    def visualise_landscape(self, data: pd.DataFrame, fig_size=(25, 10), save_fig=None, dark_theme=True):
        """ A function that visualises in 5D the decision patterns of the given DecisionMaker

        Criteria -- One panel per gender ('M', 'F', 'O'), the language on the x axis, the age on the y axis.
        Decisions -- The size of the bubble shows how many candidates were in that group and the color is the
        probability of positive decision
        The green star locates the attributes of the recruiter.

        Args:
            data: a Pandas DataFrame with the decision history of this DecisionMaker
            fig_size: a tuple indicating the size of the plot, default (25, 10)
            save_fig: a file_name where to save the plot, default None (not saving)
            dark_theme: a boolean, True for a light grey figure background, False for a transparent one,
                default True
        Raises:
            ValueError: if data is empty or misses a column, or if the gender of the DecisionMaker
                is not one of 'M', 'F', 'O'
            TypeError: if data is not a Pandas DataFrame
        """
        self._validate_history(data)
        # Compute the landscape information
        landscape = self._compute_landscape(data)
        genders = ['M', 'F', 'O']
        # Prepare the figure, one panel per gender
        fig, axes = plt.subplots(1, len(genders), figsize=fig_size)
        fig.patch.set_color('lightgrey' if dark_theme else 'none')
        for ax, gender in zip(axes, genders):
            panel = landscape[landscape['gender'] == gender]
            # Keep the grid behind the bubbles; set on the axes because they already exist
            ax.set_axisbelow(True)
            ax.grid(color='grey', linestyle='-', linewidth=1, alpha=0.4)
            ax.scatter(panel['language'], panel['age'], s=panel['total'] * 10, c=panel['probability'],
                       cmap="YlOrBr", alpha=1, edgecolors="grey", linewidth=0.5)
            ax.set_yticks(panel['age'].unique())
            ax.set_title('Candidate Gender : ' + gender)
            ax.set_xlabel("Candidate Language")
            ax.set_ylabel("Candidate Age")
        # Plot balloon size legend: a bubble has area total * 10, a legend marker size is a diameter,
        # hence the square root of the areas of 1, 10 and 100 candidates
        legend_handles = [mlines.Line2D([0], [0],
                                        linestyle='none',
                                        marker='o',
                                        alpha=1,
                                        markersize=np.sqrt(area),
                                        markerfacecolor='none',
                                        markeredgecolor='black')
                          for area in (10, 100, 1000)]
        # The legend is attached to the last panel
        axes[-1].legend(legend_handles,
                        ['1', '10', '100'],
                        title='Total Candidates',
                        numpoints=1,
                        fontsize=10,
                        bbox_to_anchor=(1., 0.8),
                        frameon=False,
                        labelspacing=3,
                        handlelength=5,
                        borderpad=4
                        )
        # Add the information about the DecisionMaker on the panel of their gender
        g_idx = genders.index(self._attributes['gender'])
        axes[g_idx].scatter(self._attributes['language'], self._attributes['age'], s=150, c='green', marker="*")
        # Saving the figure if needed
        if save_fig is not None:
            fig.tight_layout()
            fig.savefig(save_fig, bbox_inches='tight')
        plt.show()