diff --git a/check_adult_anonymity.py b/check_adult_anonymity.py index 62471fd6b185fd8d92fa0886f0ad53f6320975b8..e538714fbe040223c7bdbf42a079ea35ca1b8173 100644 --- a/check_adult_anonymity.py +++ b/check_adult_anonymity.py @@ -8,7 +8,7 @@ def check_anonymity(file_name, quasi_ident, sens_att, l_new, new_file_name): k_anon = test_anonymity.calculate_k(file_name, quasi_ident) l_div = test_anonymity.calculate_l(file_name, quasi_ident, sens_att) entropy_l = test_anonymity.calculate_entropy_l(file_name, quasi_ident, sens_att) - alpha, _ = test_anonymity.get_alpha_k(file_name, quasi_ident, sens_att) + alpha, _ = test_anonymity.calculate_alpha_k(file_name, quasi_ident, sens_att) basic_beta = test_anonymity.calculate_basic_beta(file_name, quasi_ident, sens_att) enhanced_beta = test_anonymity.calculate_enhanced_beta(file_name, quasi_ident, sens_att) delta_disclosure = test_anonymity.calculate_delta_disclosure(file_name, quasi_ident, sens_att) @@ -36,7 +36,7 @@ def check_anonymity(file_name, quasi_ident, sens_att, l_new, new_file_name): max_l = min(max_l) assert l_new <= max_l, f'Error, the maximum value for l is {max_l}' - df_new = test_anonymity.l_diversity(file_name, quasi_ident, sens_att, l_new) + df_new = test_anonymity.achieve_l_diversity(file_name, quasi_ident, sens_att, l_new) if len(df_new) > l_new: df_new.to_csv(new_file_name, index = False) print(f'Dataset veryfying l-diversity with l = {l_new} saved in: {new_file_name}.\n') diff --git a/check_airline_data_anonymity_gen.py b/check_airline_data_anonymity_gen.py index 8c4fc92458332a2dbede1053dca17fb4cfb90418..c8a07e558637cfc97d690955302324e26b9262ea 100644 --- a/check_airline_data_anonymity_gen.py +++ b/check_airline_data_anonymity_gen.py @@ -8,7 +8,7 @@ def check_anonymity(file_name, quasi_ident, sens_att): k_anon = test_anonymity.calculate_k(file_name, quasi_ident) l_div = test_anonymity.calculate_l(file_name, quasi_ident, sens_att) entropy_l = test_anonymity.calculate_entropy_l(file_name, quasi_ident, sens_att) - 
alpha, _ = test_anonymity.get_alpha_k(file_name, quasi_ident, sens_att) + alpha, _ = test_anonymity.calculate_alpha_k(file_name, quasi_ident, sens_att) basic_beta = test_anonymity.calculate_basic_beta(file_name, quasi_ident, sens_att) enhanced_beta = test_anonymity.calculate_enhanced_beta(file_name, quasi_ident, sens_att) delta_disclosure = test_anonymity.calculate_delta_disclosure(file_name, quasi_ident, sens_att) diff --git a/check_anonymity/test_anonymity.py b/check_anonymity/test_anonymity.py index 393854881bc2b97bc00f0ea49096e27e47c411d4..08ac106523e4ae67765884f9bea6f05007635367 100644 --- a/check_anonymity/test_anonymity.py +++ b/check_anonymity/test_anonymity.py @@ -1,5 +1,5 @@ """Module with different functions which calculate properties about anonymity: -k.anonimity, (alpha,k)-anonymity, l-diversity, entropy l-diversity, (c,l)-diversity, +k-anonymity, (alpha,k)-anonymity, l-diversity, entropy l-diversity, (c,l)-diversity, basic beta-likeness, enhanced beta-likeness, t-closeness and delta-disclosure privacy.""" import os @@ -109,6 +109,10 @@ def calculate_k(file_name, quasi_ident): Parameter quasi_ident: list with the name of the columns of the dataframe that are quasi-identifiers. Precondition: quasi_ident is a list of strings. + + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. """ data = read_file(file_name) check_qi(data, quasi_ident) @@ -116,21 +120,6 @@ def calculate_k(file_name, quasi_ident): k_anon = min([len(x) for x in equiv_class]) return k_anon -def aux_calculate_k(data, quasi_ident): - """Calculate k for k-anonymity. - - Parameter data: dataframe with the data under study. - Precondition: data is a pandas dataframe. - - Parameter quasi_ident: list with the name of the columns of the dataframe - that are quasi-identifiers. - Precondition: quasi_ident is a list of strings. 
- """ - check_qi(data, quasi_ident) - equiv_class = get_equiv_class(data, quasi_ident) - k_anon = min([len(x) for x in equiv_class]) - return k_anon - def convert(set_): """Converts a set to a list. @@ -152,6 +141,10 @@ def calculate_l(file_name, quasi_ident, sens_att, gen = True): Parameter sens_att: list with the name of the columns of the dataframe that are the sensitive attributes. Precondition: sens_att is a list of strings. + + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. """ quasi_ident = np.array(quasi_ident) sens_att = np.array(sens_att) @@ -176,7 +169,7 @@ def calculate_l(file_name, quasi_ident, sens_att, gen = True): l_div.append(min(l_ec)) return min(l_div) -def l_diversity(file_name, quasi_ident, sens_att, l_new): +def achieve_l_diversity(file_name, quasi_ident, sens_att, l_new): """Given l, transform the dataset into a new one checking l-diversity for the new l, only using suppression. @@ -224,6 +217,10 @@ def calculate_entropy_l(file_name, quasi_ident, sens_att, gen = True): Parameter sens_att: list with the name of the columns of the dataframe that are the sensitive attributes. Precondition: sens_att is a list of strings. + + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. """ quasi_ident = np.array(quasi_ident) data = read_file(file_name) @@ -275,6 +272,10 @@ def calculate_c_l_diversity(file_name, quasi_ident, sens_att, imp = 0, gen = Tru Parameter imp: impression level. Precondition: imp is an int, imp = 1 if comments need to be displayed. + + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. 
""" quasi_ident = np.array(quasi_ident) data = read_file(file_name) @@ -313,7 +314,7 @@ def calculate_c_l_diversity(file_name, quasi_ident, sens_att, imp = 0, gen = Tru return c_div, l_div -def get_alpha_k(file_name, quasi_ident, sens_att, gen = True): +def calculate_alpha_k(file_name, quasi_ident, sens_att, gen = True): """Calculate alpha and k for (alpha,k)-anonymity. Parameter file_name: name of the file with the data under study. @@ -326,6 +327,10 @@ def get_alpha_k(file_name, quasi_ident, sens_att, gen = True): Parameter sens_att: list with the name of the columns of the dataframe that are the sensitive attributes. Precondition: sens_att is a list of strings. + + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. """ quasi_ident = np.array(quasi_ident) data = read_file(file_name) @@ -358,7 +363,7 @@ def get_alpha_k(file_name, quasi_ident, sens_att, gen = True): return alpha, k_anon -def aux_calculate_beta(data, quasi_ident, sens_att_value): +def __aux_calculate_beta(data, quasi_ident, sens_att_value): """Auxiliary function for beta calculation for basic and enhanced beta-likeness. Parameter data: dataframe with the data under study. @@ -396,6 +401,10 @@ def calculate_basic_beta(file_name, quasi_ident, sens_att, gen = True): Parameter sens_att: list with the name of the columns of the dataframe that are the sensitive attributes. Precondition: sens_att is a list of strings. + + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. 
""" quasi_ident = np.array(quasi_ident) data = read_file(file_name) @@ -404,12 +413,12 @@ def calculate_basic_beta(file_name, quasi_ident, sens_att, gen = True): beta_sens_att = [] if gen: for sens_att_value in sens_att: - _, dist = aux_calculate_beta(data, quasi_ident, sens_att_value) + _, dist = __aux_calculate_beta(data, quasi_ident, sens_att_value) beta_sens_att.append(max(dist)) else: for i, sens_att_value in enumerate(sens_att): tmp_qi = np.concatenate([quasi_ident, np.delete(sens_att, i)]) - _, dist = aux_calculate_beta(data, tmp_qi, sens_att_value) + _, dist = __aux_calculate_beta(data, tmp_qi, sens_att_value) beta_sens_att.append(max(dist)) beta = max(beta_sens_att) return beta @@ -427,6 +436,10 @@ def calculate_enhanced_beta(file_name, quasi_ident, sens_att, gen = True): Parameter sens_att: list with the name of the columns of the dataframe that are the sensitive attributes. Precondition: sens_att is a list of strings. + + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. 
""" quasi_ident = np.array(quasi_ident) data = read_file(file_name) @@ -435,19 +448,19 @@ def calculate_enhanced_beta(file_name, quasi_ident, sens_att, gen = True): beta_sens_att = [] if gen: for sens_att_value in sens_att: - p, dist = aux_calculate_beta(data, quasi_ident, sens_att_value) + p, dist = __aux_calculate_beta(data, quasi_ident, sens_att_value) min_beta_lnp = [min(max(dist), -np.log(p_i)) for p_i in p] beta_sens_att.append(max(min_beta_lnp)) else: for i, sens_att_value in enumerate(sens_att): tmp_qi = np.concatenate([quasi_ident, np.delete(sens_att, i)]) - p, dist = aux_calculate_beta(data, tmp_qi, sens_att_value) + p, dist = __aux_calculate_beta(data, tmp_qi, sens_att_value) min_beta_lnp = [min(max(dist), -np.log(p_i)) for p_i in p] beta_sens_att.append(max(min_beta_lnp)) beta = max(beta_sens_att) return beta -def aux_calculate_delta_disclosure(data, quasi_ident, sens_att_value): +def __aux_calculate_delta_disclosure(data, quasi_ident, sens_att_value): """Auxiliary function for delta calculation for delta-disclousure privacy. Parameter data: dataframe with the data under study. @@ -460,6 +473,10 @@ def aux_calculate_delta_disclosure(data, quasi_ident, sens_att_value): Parameter sens_att: list with the name of the columns of the dataframe that are the sensitive attributes. Precondition: sens_att is a list of strings. + + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. """ equiv_class = get_equiv_class(data, quasi_ident) values = np.unique(data[sens_att_value].values) @@ -485,6 +502,10 @@ def calculate_delta_disclosure(file_name, quasi_ident, sens_att, gen = True): Parameter sens_att: list with the name of the columns of the dataframe that are the sensitive attributes. Precondition: sens_att is a list of strings. 
+ + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. """ quasi_ident = np.array(quasi_ident) data = read_file(file_name) @@ -493,17 +514,17 @@ def calculate_delta_disclosure(file_name, quasi_ident, sens_att, gen = True): delta_sens_att = [] if gen: for sens_att_value in sens_att: - aux = aux_calculate_delta_disclosure(data, quasi_ident, sens_att_value) + aux = __aux_calculate_delta_disclosure(data, quasi_ident, sens_att_value) delta_sens_att.append(max(aux)) else: for i, sens_att_value in enumerate(sens_att): tmp_qi = np.concatenate([quasi_ident, np.delete(sens_att, i)]) - aux = aux_calculate_delta_disclosure(data, tmp_qi, sens_att_value) + aux = __aux_calculate_delta_disclosure(data, tmp_qi, sens_att_value) delta_sens_att.append(max(aux)) delta = max(delta_sens_att) return delta -def aux_t_closeness_num(data, quasi_ident, sens_att_value): +def __aux_t_closeness_num(data, quasi_ident, sens_att_value): """Auxiliary function for t calculation for t-closeness. Function used for numerical attributes: the definition of the EMD is used. @@ -535,7 +556,7 @@ def aux_t_closeness_num(data, quasi_ident, sens_att_value): emd.append(emd_ec) return max(emd) -def aux_t_closeness_str(data, quasi_ident, sens_att_value): +def __aux_t_closeness_str(data, quasi_ident, sens_att_value): """Auxiliary function for t calculation for t-closeness. Function used for categorical attributes: the metric "Equal Distance" is used. @@ -579,6 +600,10 @@ def calculate_t_closeness(file_name, quasi_ident, sens_att, gen = True): Parameter sens_att: list with the name of the columns of the dataframe that are the sensitive attributes. Precondition: sens_att is a list of strings. + + Parameter gen: boolean, if true, it is generalized for the case of multiple + SA, if False, the set of QI is updated for each SA. + Precondition: gen = True (default) or gen = False. 
""" quasi_ident = np.array(quasi_ident) data = read_file(file_name) @@ -588,19 +613,21 @@ def calculate_t_closeness(file_name, quasi_ident, sens_att, gen = True): if gen: for sens_att_value in sens_att: if pd.api.types.is_numeric_dtype(data[sens_att_value]): - t_sens_att.append(aux_t_closeness_num(data, quasi_ident, sens_att_value)) + t_sens_att.append(__aux_t_closeness_num(data, quasi_ident, sens_att_value)) elif pd.api.types.is_string_dtype(data[sens_att_value]): - t_sens_att.append(aux_t_closeness_str(data, quasi_ident, sens_att_value)) + t_sens_att.append(__aux_t_closeness_str(data, quasi_ident, sens_att_value)) else: raise ValueError('Error, invalid sens_att value type') else: for i, sens_att_value in enumerate(sens_att): if pd.api.types.is_numeric_dtype(data[sens_att_value]): tmp_qi = np.concatenate([quasi_ident, np.delete(sens_att, i)]) - t_sens_att.append(aux_t_closeness_num(data, tmp_qi, sens_att_value)) + t_sens_att.append(__aux_t_closeness_num(data, tmp_qi, sens_att_value)) elif pd.api.types.is_string_dtype(data[sens_att_value]): tmp_qi = np.concatenate([quasi_ident, np.delete(sens_att, i)]) - t_sens_att.append(aux_t_closeness_str(data, tmp_qi, sens_att_value)) + t_sens_att.append(__aux_t_closeness_str(data, tmp_qi, sens_att_value)) else: raise ValueError('Error, invalid sens_att value type') return max(t_sens_att) + + diff --git a/check_drug_data_anonymity.py b/check_drug_data_anonymity.py index 32c36b3fe9d664608f173d74ca3d5d81b0c0b000..d81e08eeace6b1f70fb8450fe7fbb54bb22f26b5 100644 --- a/check_drug_data_anonymity.py +++ b/check_drug_data_anonymity.py @@ -8,7 +8,7 @@ def check_anonymity(file_name, quasi_ident, sens_att, l_new, new_file_name): k_anon = test_anonymity.calculate_k(file_name, quasi_ident) l_div = test_anonymity.calculate_l(file_name, quasi_ident, sens_att) entropy_l = test_anonymity.calculate_entropy_l(file_name, quasi_ident, sens_att) - alpha, _ = test_anonymity.get_alpha_k(file_name, quasi_ident, sens_att) + alpha, _ = 
test_anonymity.calculate_alpha_k(file_name, quasi_ident, sens_att) basic_beta = test_anonymity.calculate_basic_beta(file_name, quasi_ident, sens_att) enhanced_beta = test_anonymity.calculate_enhanced_beta(file_name, quasi_ident, sens_att) delta_disclosure = test_anonymity.calculate_delta_disclosure(file_name, quasi_ident, sens_att) @@ -36,7 +36,7 @@ def check_anonymity(file_name, quasi_ident, sens_att, l_new, new_file_name): max_l = min(max_l) assert l_new <= max_l, f'Error, the maximum value for l is {max_l}' - df_new = test_anonymity.l_diversity(file_name, quasi_ident, sens_att, l_new) + df_new = test_anonymity.achieve_l_diversity(file_name, quasi_ident, sens_att, l_new) if len(df_new) > l_new: df_new.to_csv(new_file_name, index = False) print(f'Dataset veryfying l-diversity with l = {l_new} saved in: {new_file_name}.\n') diff --git a/check_math_scores_anonymity.py b/check_math_scores_anonymity.py index c938e76f47eeac47ac3a8f8ce5aa903477aaff2e..b807c189c1e04f532e451dd537c0aca7dddb43d0 100644 --- a/check_math_scores_anonymity.py +++ b/check_math_scores_anonymity.py @@ -8,7 +8,7 @@ def check_anonymity(file_name, quasi_ident, sens_att, l_new, new_file_name): k_anon = test_anonymity.calculate_k(file_name, quasi_ident) l_div = test_anonymity.calculate_l(file_name, quasi_ident, sens_att) entropy_l = test_anonymity.calculate_entropy_l(file_name, quasi_ident, sens_att) - alpha, _ = test_anonymity.get_alpha_k(file_name, quasi_ident, sens_att) + alpha, _ = test_anonymity.calculate_alpha_k(file_name, quasi_ident, sens_att) basic_beta = test_anonymity.calculate_basic_beta(file_name, quasi_ident, sens_att) enhanced_beta = test_anonymity.calculate_enhanced_beta(file_name, quasi_ident, sens_att) delta_disclosure = test_anonymity.calculate_delta_disclosure(file_name, quasi_ident, sens_att) @@ -36,7 +36,7 @@ def check_anonymity(file_name, quasi_ident, sens_att, l_new, new_file_name): max_l = min(max_l) assert l_new <= max_l, f'Error, the maximum value for l is {max_l}' - df_new = 
test_anonymity.l_diversity(file_name, quasi_ident, sens_att, l_new) + df_new = test_anonymity.achieve_l_diversity(file_name, quasi_ident, sens_att, l_new) if len(df_new) > l_new: df_new.to_csv(new_file_name, index = False) print(f'Dataset veryfying l-diversity with l = {l_new} saved in: {new_file_name}.\n') diff --git a/check_stroke_data_anonymity.py b/check_stroke_data_anonymity.py index 839520b19ad2a1ec412517db943f3374b01fe639..f6f09ce735a4752bd6f5a74b8ace4194c5f76e0e 100644 --- a/check_stroke_data_anonymity.py +++ b/check_stroke_data_anonymity.py @@ -8,7 +8,7 @@ def check_anonymity(file_name, quasi_ident, sens_att, l_new, new_file_name): k_anon = test_anonymity.calculate_k(file_name, quasi_ident) l_div = test_anonymity.calculate_l(file_name, quasi_ident, sens_att) entropy_l = test_anonymity.calculate_entropy_l(file_name, quasi_ident, sens_att) - alpha, _ = test_anonymity.get_alpha_k(file_name, quasi_ident, sens_att) + alpha, _ = test_anonymity.calculate_alpha_k(file_name, quasi_ident, sens_att) basic_beta = test_anonymity.calculate_basic_beta(file_name, quasi_ident, sens_att) enhanced_beta = test_anonymity.calculate_enhanced_beta(file_name, quasi_ident, sens_att) delta_disclosure = test_anonymity.calculate_delta_disclosure(file_name, quasi_ident, sens_att) @@ -36,7 +36,7 @@ def check_anonymity(file_name, quasi_ident, sens_att, l_new, new_file_name): max_l = min(max_l) assert l_new <= max_l, f'Error, the maximum value for l is {max_l}' - df_new = test_anonymity.l_diversity(file_name, quasi_ident, sens_att, l_new) + df_new = test_anonymity.achieve_l_diversity(file_name, quasi_ident, sens_att, l_new) if len(df_new) > l_new: df_new.to_csv(new_file_name, index = False) print(f'Dataset veryfying l-diversity with l = {l_new} saved in: {new_file_name}.\n')