Sample Requests for Protegrity Anonymization
Modify and use the sample requests provided here for anonymizing your dataset. Use these requests as a template or as a guideline for building the required request.
Tree-based Aggregation for Attributes with k-Anonymity
This sample uses the following attributes:
- Source: Local file system
- Target: Amazon S3 bucket
- Data set: 1 Quasi Identifier
- Suppression: 0.01
- Privacy Model: K-Anonymity with k value as 50
In this example, the data has custom delimiters.
{
"source": {
"type": "File",
"file": {
"name": "samples/adult.csv",
"props": {
"sep": ";"
}
}
},
"attributes": [
{
"name": "age",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Masking Based",
"hierarchyType": "Rule",
"rule": {
"masking": {
"maskOrder": "Right To Left",
"maskChar": "*",
"maxDomainSize": 2
}
}
}
}
],
"privacyModel": {
"k": {
"kValue": 50
}
},
"config": {
"maxSuppression": 0.01
},
"target": {
"type": "File",
"file": {
"name": "s3://<Your-S3-BucketName>/anon-adult-e1.csv",
"props": {
"lineterminator": "\n"
},
"accessOptions": {
"key": "<Your-S3-API Key>",
"secret": "<Your-S3-API Secret>"
}
}
}
}
# Sample: anonymize the "age" column with masking-based generalization under
# k-anonymity (k=50, max suppression 0.01) and write the result to Amazon S3.

# Import the anonsdk library and pandas
import anonsdk as asdk
import pandas as pd

# S3 bucket credentials: replace the placeholder strings with your own values
s3_key = "<AWS_Key>"
s3_secret = "<AWS_Secret>"

# Source dataset path and its data store object
source_csv_path = "adult.csv"
source_datastore = asdk.FileDataStore(source_csv_path)

# Target path for the anonymized result and its data store object
target_csv_path = "s3://target/anon-adult-e1.csv"
target_datastore = asdk.FileDataStore(
    target_csv_path,
    access_options={"key": s3_key, "secret": s3_secret},
)

# Create the connection object for the REST API server
conn = asdk.Connection("https://anon.protegrity.com/")

# Load the source data; the file uses ";" as its delimiter
df = pd.read_csv(source_csv_path, sep=";")
# Preview the first rows (the result is only displayed in a notebook/REPL)
df.head()

# Create the AnonElement with the connection, dataframe metadata and source store
anon_object = asdk.AnonElement(conn, df, source_datastore)

# Configure masking-based generalization for the string column "age"
anon_object["age"] = asdk.Gen_Mask(maskchar="*",maskOrder="R",maxLength=2)

# Configure k-anonymity and the suppression allowed in the dataset
anon_object.config.k = asdk.K(50)
anon_object.config['maxSuppression'] = 0.01

# Send the anonymization request with the transformation configuration and target store
job = asdk.anonymize(anon_object, target_datastore, force=True)

# Check the status of the job <check the status iteratively until 'status': 'Completed' >
job.status()
# Check the comparative risk statistics from the source and result dataset
job.riskStat()
# Check the comparative utility statistics from the source and result dataset
job.utilityStat()
Tree-based Aggregation for Attributes with k-Anonymity, l-Diversity, and t-Closeness
This sample uses the following attributes:
- Source: Local file system
- Target: Amazon S3 bucket
- Data set: 4 Quasi Identifiers, 2 Sensitive Attributes
- Suppression: 0.10
- Privacy Model: K with value 3, T-closeness with value 0.2, and L-diversity with value 2
In this example, for an attribute, the generalization hierarchy is a part of the request.
{
"source": {
"type": "File",
"file": {
"name": "samples/adult.csv",
"props": {
"sep": ";",
"decimal": ",",
"quotechar": "\"",
"escapechar": "\\",
"encoding": "utf-8"
}
}
},
"attributes": [
{
"name": "marital-status",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Tree Based",
"hierarchyType": "Data Store",
"dataStore": {
"type": "File",
"format": "CSV",
"file": {
"name": "samples/hierarchy/adult_hierarchy_marital-status.csv",
"props": {
"delimiter": ";",
"quotechar": "\"",
"header": null
}
}
}
}
},
{
"name": "native-country",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Tree Based",
"hierarchyType": "Data Store",
"dataStore": {
"type": "File",
"format": "CSV",
"file": {
"name": "samples/hierarchy/adult_hierarchy_native-country.csv",
"props": {
"delimiter": ";",
"quotechar": "\"",
"header": null
}
}
}
}
},
{
"name": "occupation",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Tree Based",
"hierarchyType": "Data Store",
"dataStore": {
"type": "File",
"format": "CSV",
"file": {
"name": "samples/hierarchy/adult_hierarchy_occupation.csv",
"props": {
"delimiter": ";",
"quotechar": "\"",
"header": null
}
}
}
}
},
{
"name": "race",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Tree Based",
"hierarchyType": "Data",
"data": {
"hierarchy": [
[
"White",
"*"
],
[
"Asian-Pac-Islander",
"*"
],
[
"Amer-Indian-Eskimo",
"*"
],
[
"Black",
"*"
]
],
"defaultHierarchy": [
"Other",
"*"
]
}
}
},
{
"name": "sex",
"dataType": "String",
"classificationType": "Sensitive Attribute"
},
{
"name": "salary-class",
"dataType": "String",
"classificationType": "Sensitive Attribute"
}
],
"config": {
"maxSuppression": 0.10
},
"privacyModel": {
"k": {
"kValue": 3
},
"tcloseness": [
{
"name": "salary-class",
"emdType": "EMD with equal ground distance",
"tFactor": 0.2
}
],
"ldiversity": [
{
"name": "sex",
"lFactor": 2,
"lType": "Distinct-l-diversity"
}
]
},
"target": {
"type": "File",
"file": {
"name": "s3://<Your-S3-BucketName>/anon-adult_klt.csv",
"props": {
"lineterminator": "\n"
},
"accessOptions": {
"key": "<Your-S3-API Key>",
"secret": "<Your-S3-API Secret>"
}
}
}
}
# Sample: tree-based generalization for four quasi identifiers under
# k-anonymity (k=3), l-diversity (l=2) and t-closeness (t=0.2), with the
# result written to Amazon S3.

# Import the anonsdk library and pandas
import anonsdk as asdk
import pandas as pd

# S3 bucket credentials: replace the placeholder strings with your own values
s3_key = "<AWS_Key>"
s3_secret = "<AWS_Secret>"

# Source dataset path and its data store object
source_csv_path = "adult.csv"
source_datastore = asdk.FileDataStore(source_csv_path)

# Target path for the anonymized result and its data store object
target_csv_path = "s3://target/anon-adult_klt.csv"
target_datastore = asdk.FileDataStore(
    target_csv_path,
    access_options={"key": s3_key, "secret": s3_secret},
)

# Create the connection object for the REST API server
conn = asdk.Connection("https://anon.protegrity.com/")

# Load the source data; the file uses ";" as its delimiter
df = pd.read_csv(source_csv_path, sep=";")
# Preview the first rows (the result is only displayed in a notebook/REPL)
df.head()

# Create the AnonElement with the connection, dataframe metadata and source store
anon_object = asdk.AnonElement(conn, df, source_datastore)

# Tree-based generalization hierarchies loaded from ";"-delimited CSV files.
# pandas.read_csv already returns an in-memory DataFrame, so no .compute()
# call is needed (that method exists on dask DataFrames, not pandas ones).
# NOTE(review): the REST request declares these hierarchy files with
# "header": null; confirm whether header=None should be passed to read_csv.
hierarchy_marital_status_path = "samples/hierarchy/adult_hierarchy_marital-status.csv"
df_ms = pd.read_csv(hierarchy_marital_status_path, sep=";")
print(df_ms)
anon_object['marital-status'] = asdk.Gen_Tree(df_ms)

hierarchy_native_country_path = "samples/hierarchy/adult_hierarchy_native-country.csv"
df_nc = pd.read_csv(hierarchy_native_country_path, sep=";")
print(df_nc)
# The key must match the dataset column name "native-country"
anon_object['native-country'] = asdk.Gen_Tree(df_nc)

hierarchy_occupation_path = "samples/hierarchy/adult_hierarchy_occupation.csv"
df_occ = pd.read_csv(hierarchy_occupation_path, sep=";")
print(df_occ)
anon_object['occupation'] = asdk.Gen_Tree(df_occ)

# Inline hierarchy for "race": every value generalizes to "*"
df_race = pd.DataFrame(
    data={
        "lvl0": ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Black", "Other"],
        "lvl1": ["*", "*", "*", "*", "*"],
    }
)
anon_object['race'] = asdk.Gen_Tree(df_race)

# Configure k-anonymity and the suppression allowed in the dataset
anon_object.config.k = asdk.K(3)
anon_object.config['maxSuppression'] = 0.10

# Configure l-diversity and t-closeness on the sensitive attributes
anon_object["sex"] = asdk.LDiv(lfactor=2)
anon_object["salary-class"] = asdk.TClose(tfactor=0.2)

# Send the anonymization request with the transformation configuration and target store
job = asdk.anonymize(anon_object, target_datastore, force=True)

# Check the status of the job
job.status()
# Check the comparative risk statistics from the source and result dataset
job.riskStat()
# Check the comparative utility statistics from the source and result dataset
job.utilityStat()
Micro-Aggregation and Generalization with Aggregates
This sample uses the following attributes:
- Source: Local file system
- Target: Amazon S3 bucket
- Data set: 2 Quasi Identifiers, 1 Aggregation-based Quasi Identifier, 2 Micro Aggregations, and 2 Sensitive Attributes
- Suppression: 0.50
- Privacy Model: K with value 5, T-closeness with value 0.2, and L-diversity with value 2
{
"source": {
"type": "File",
"file": {
"name": "samples/adult.csv",
"props": {
"sep": ";"
}
}
},
"attributes": [
{
"name": "age",
"dataType": "Integer",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Micro Aggregation",
"aggregateFn": "GMean"
},
{
"name": "marital-status",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Micro Aggregation",
"aggregateFn": "Mode"
},
{
"name": "native-country",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Tree Based",
"hierarchyType": "Data Store",
"dataStore": {
"type": "File",
"format": "CSV",
"file": {
"name": "samples/hierarchy/adult_hierarchy_native-country.csv",
"props": {
"delimiter": ";",
"quotechar": "\"",
"header": null
}
}
}
}
},
{
"name": "occupation",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Tree Based",
"hierarchyType": "Data Store",
"dataStore": {
"type": "File",
"format": "CSV",
"file": {
"name": "samples/hierarchy/adult_hierarchy_occupation.csv",
"props": {
"delimiter": ";",
"quotechar": "\"",
"header": null
}
}
}
}
},
{
"name": "race",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Aggregation Based",
"hierarchyType": "Aggregate",
"aggregateFn": "Mode"
}
},
{
"name": "sex",
"classificationType": "Sensitive Attribute",
"dataType": "String"
},
{
"name": "salary-class",
"classificationType": "Sensitive Attribute",
"dataType": "String"
}
],
"config": {
"maxSuppression": 0.50
},
"privacyModel": {
"k": {
"kValue": 5
},
"tcloseness": [
{
"name": "salary-class",
"emdType": "EMD with equal ground distance",
"tFactor": 0.2
}
],
"ldiversity": [
{
"name": "sex",
"lType": "Distinct-l-diversity",
"lFactor": 2
}
]
},
"target": {
"type": "File",
"file": {
"name": "s3://<Your-S3-BucketName>/anon-adult_micro.csv",
"props": {
"lineterminator": "\n"
},
"accessOptions": {
"key": "<Your-S3-API Key>",
"secret": "<Your-S3-API Secret>"
}
}
}
}
# Sample: micro-aggregation and aggregation-based generalization combined with
# tree-based hierarchies under k-anonymity (k=5), l-diversity (l=2) and
# t-closeness (t=0.2), with the result written to Amazon S3.

# Import the anonsdk library and pandas
import anonsdk as asdk
import pandas as pd

# S3 bucket credentials: replace the placeholder strings with your own values
s3_key = "<AWS_Key>"
s3_secret = "<AWS_Secret>"

# Source dataset path and its data store object
source_csv_path = "adult.csv"
source_datastore = asdk.FileDataStore(source_csv_path)

# Target path for the anonymized result and its data store object
target_csv_path = "s3://target/anon-adult_micro.csv"
target_datastore = asdk.FileDataStore(
    target_csv_path,
    access_options={"key": s3_key, "secret": s3_secret},
)

# Create the connection object for the REST API server
conn = asdk.Connection("https://anon.protegrity.com/")

# Load the source data; the file uses ";" as its delimiter
df = pd.read_csv(source_csv_path, sep=";")
# Preview the first rows (the result is only displayed in a notebook/REPL)
df.head()

# Create the AnonElement with the connection, dataframe metadata and source store
anon_object = asdk.AnonElement(conn, df, source_datastore)

# Tree-based generalization hierarchies loaded from ";"-delimited CSV files.
# NOTE(review): the REST request declares these hierarchy files with
# "header": null; confirm whether header=None should be passed to read_csv.
hierarchy_native_country_path = "samples/hierarchy/adult_hierarchy_native-country.csv"
df_nc = pd.read_csv(hierarchy_native_country_path, sep=";")
print(df_nc)
# The key must match the dataset column name "native-country"
anon_object['native-country'] = asdk.Gen_Tree(df_nc)

hierarchy_occupation_path = "samples/hierarchy/adult_hierarchy_occupation.csv"
df_occ = pd.read_csv(hierarchy_occupation_path, sep=";")
print(df_occ)
# The occupation hierarchy belongs to the "occupation" column; "marital-status"
# is handled by micro-aggregation below.
anon_object['occupation'] = asdk.Gen_Tree(df_occ)

# Aggregation-based generalization for "race"
anon_object['race'] = asdk.Gen_Agg(asdk.AggregateFunction.Mode)

# Micro-aggregation rules for "age" and "marital-status"
anon_object['age'] = asdk.MicroAgg(asdk.AggregateFunction.GMean)
anon_object['marital-status'] = asdk.MicroAgg(asdk.AggregateFunction.Mode)

# Configure k-anonymity and the suppression allowed in the dataset
anon_object.config.k = asdk.K(5)
anon_object.config['maxSuppression'] = 0.50

# Configure l-diversity and t-closeness on the sensitive attributes
anon_object["sex"] = asdk.LDiv(lfactor=2)
anon_object["salary-class"] = asdk.TClose(tfactor=0.2)

# Send the anonymization request with the transformation configuration and target store
job = asdk.anonymize(anon_object, target_datastore, force=True)

# Check the status of the job
job.status()
# Check the comparative risk statistics from the source and result dataset
job.riskStat()
# Check the comparative utility statistics from the source and result dataset
job.utilityStat()
Parquet File Format
This sample uses the following attributes:
- Source: Local file system
- Target: Amazon S3 bucket in the Parquet format
- Data set: 4 Quasi Identifiers, 1 Aggregation-based Quasi Identifier, 1 Micro Aggregation, and 1 Sensitive Attribute
- Suppression: 0.4
- Privacy Model: K with value 350 and L-diversity with value 2
In this example, for an attribute, the generalization hierarchy is part of the request.
{
"source": {
"type": "File",
"file": {
"name": "samples/adult.csv",
"props": {
"sep": ";",
"decimal": ",",
"quotechar": "\"",
"escapechar": "\\",
"encoding": "utf-8"
}
}
},
"attributes": [
{
"name": "age",
"dataType": "Integer",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"hierarchyType": "Rule",
"type": "Rounding",
"rule": {
"interval": {
"levels": [
"5",
"10",
"50",
"100"
],
"lowerBound":"5",
"upperBound":"100"
}
}
}
},
{
"name": "marital-status",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Micro Aggregation",
"aggregateFn": "Mode"
},
{
"name": "citizenSince",
"dataType": "Date",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Rounding",
"hierarchyType": "Rule",
"rule": {
"daterange": {
"levels": [
"WD.M.Y",
"FD.M.Y",
"QTR.Y",
"Y"
]
}
}
},
"props": {
"dateformat": "dd-mm-yyyy"
}
},
{
"name": "occupation",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Tree Based",
"hierarchyType": "Data Store",
"dataStore": {
"type": "File",
"format": "CSV",
"file": {
"name": "samples/hierarchy/adult_hierarchy_occupation.csv",
"props": {
"delimiter": ";",
"quotechar": "\"",
"header": null
}
}
}
}
},
{
"name": "race",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"dataType": "String",
"generalization": {
"type": "Aggregation Based",
"hierarchyType": "Aggregate",
"aggregateFn": "Mode"
}
},
{
"name": "salary-class",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Masking Based",
"hierarchyType": "Rule",
"rule": {
"masking": {
"maskOrder": "Left To Right",
"maskChar": "*",
"maxDomainSize": 3
}
}
}
},
{
"name": "sex",
"dataType": "String",
"classificationType": "Sensitive Attribute"
}
],
"config": {
"maxSuppression": 0.4,
"redactOutliers": true,
"suppressionData": "Any"
},
"privacyModel": {
"k": {
"kValue": 350
},
"ldiversity": [
{
"name": "sex",
"lType": "Distinct-l-diversity",
"lFactor": 2
}
]
},
"target": {
"type": "File",
"file": {
"name": "s3://<Your-S3-BucketName>/anon-adult-rules",
"format": "Parquet",
"accessOptions": {
"key": "<Your-S3-API Key>",
"secret": "<Your-S3-API Secret>"
}
}
}
}
This sample is not applicable to the SDK functions.
Retaining and Redacting
This sample uses the following attributes:
- Source: Local file system
- Target: Amazon S3 bucket in the Parquet format
- Data set: 2 Quasi Identifiers, 1 Aggregation-based Quasi Identifier, 1 Micro Aggregation, 1 Non-Sensitive Attribute, 1 Identifying Attribute, and 2 Sensitive Attributes
- Suppression: 0.10
- Privacy Model: K with value 200 and L-diversity with value 2
In this example, for an attribute, the generalization hierarchy is part of the request.
{
"source": {
"type": "File",
"file": {
"name": "samples/adult.csv",
"props": {
"sep": ";",
"decimal": ",",
"quotechar": "\"",
"escapechar": "\\",
"encoding": "utf-8"
}
}
},
"attributes": [
{
"name": "age",
"dataType": "Integer",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Rounding",
"hierarchyType": "Rule",
"rule": {
"interval": {
"levels": [
"5",
"10",
"50",
"100"
]
}
}
}
},
{
"name": "marital-status",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Micro Aggregation",
"aggregateFn": "Mode"
},
{
"name": "occupation",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Tree Based",
"hierarchyType": "Data Store",
"dataStore": {
"type": "File",
"format": "CSV",
"file": {
"name": "samples/hierarchy/adult_hierarchy_occupation.csv",
"props": {
"delimiter": ";",
"quotechar": "\"",
"header": null
}
}
}
}
},
{
"name": "race",
"dataType": "String",
"classificationType": "Quasi Identifier",
"dataTransformationType": "Generalization",
"generalization": {
"type": "Aggregation Based",
"hierarchyType": "Aggregate",
"aggregateFn": "Mode"
}
},
{
"name": "citizenSince",
"dataType": "Date",
"classificationType": "Identifying Attribute"
},
{
"name": "education",
"dataType": "String",
"classificationType": "Non-Sensitive Attribute"
},
{
"name": "salary-class",
"dataType": "String",
"classificationType": "Sensitive Attribute"
},
{
"name": "sex",
"dataType": "String",
"classificationType": "Sensitive Attribute"
}
],
"config": {
"maxSuppression": 0.10,
"suppressionData": "Any"
},
"privacyModel": {
"k": {
"kValue": 200
},
"ldiversity": [
{
"name": "sex",
"lType": "Distinct-l-diversity",
"lFactor": 2
},
{
"name": "salary-class",
"lType": "Distinct-l-diversity",
"lFactor": 2
}
]
},
"target": {
"type": "File",
"file": {
"name": "s3://<Your-S3-BucketName>/anon-adult_retd",
"format": "Parquet",
"accessOptions": {
"key": "<Your-S3-API Key>",
"secret": "<Your-S3-API Secret>"
}
}
}
}
# Sample: retaining and redacting attributes alongside generalization and
# micro-aggregation under k-anonymity (k=200) and l-diversity (l=2), with the
# result written to Amazon S3.

# Import the anonsdk library and pandas
import anonsdk as asdk
import pandas as pd

# S3 bucket credentials: replace the placeholder strings with your own values
s3_key = "<AWS_Key>"
s3_secret = "<AWS_Secret>"

# Source dataset path and its data store object
source_csv_path = "adult.csv"
source_datastore = asdk.FileDataStore(source_csv_path)

# Target path for the anonymized result and its data store object
target_csv_path = "s3://target/anon-adult_retd"
target_datastore = asdk.FileDataStore(
    target_csv_path,
    access_options={"key": s3_key, "secret": s3_secret},
)

# Create the connection object for the REST API server
conn = asdk.Connection("https://anon.protegrity.com/")

# Load the source data; the file uses ";" as its delimiter
df = pd.read_csv(source_csv_path, sep=";")
# Preview the first rows (the result is only displayed in a notebook/REPL)
df.head()

# Create the AnonElement with the connection, dataframe metadata and source store
anon_object = asdk.AnonElement(conn, df, source_datastore)

# Tree-based generalization hierarchy for "occupation".
# NOTE(review): the REST request declares this hierarchy file with
# "header": null; confirm whether header=None should be passed to read_csv.
hierarchy_occupation_path = "samples/hierarchy/adult_hierarchy_occupation.csv"
df_occ = pd.read_csv(hierarchy_occupation_path, sep=";")
print(df_occ)
# The occupation hierarchy belongs to the "occupation" column; "marital-status"
# is handled by micro-aggregation on the next line.
anon_object['occupation'] = asdk.Gen_Tree(df_occ)

# Micro-aggregation, aggregation-based and interval-based generalization
anon_object['marital-status'] = asdk.MicroAgg(asdk.AggregateFunction.Mode)
anon_object['race'] = asdk.Gen_Agg(asdk.AggregateFunction.Mode)
anon_object['age'] = asdk.Gen_Interval([5, 10, 50, 100])

# Redact the identifying attribute and retain the non-sensitive attribute,
# mirroring the classification used in the REST request for this sample.
# NOTE(review): confirm that Redact() corresponds to "Identifying Attribute"
# and Preserve() to "Non-Sensitive Attribute" in your SDK version.
anon_object['citizenSince'] = asdk.Redact()
anon_object['education'] = asdk.Preserve()

# Configure k-anonymity and the suppression allowed in the dataset
anon_object.config.k = asdk.K(200)
anon_object.config['maxSuppression'] = 0.10

# Configure l-diversity on the sensitive attributes. These columns are not
# redacted beforehand: the original Redact() assignments were immediately
# replaced by these LDiv settings.
anon_object["sex"] = asdk.LDiv(lfactor=2)
anon_object["salary-class"] = asdk.LDiv(lfactor=2)

# Send the anonymization request with the transformation configuration and target store
job = asdk.anonymize(anon_object, target_datastore, force=True)

# Check the status of the job
job.status()
# Check the comparative risk statistics from the source and result dataset
job.riskStat()
# Check the comparative utility statistics from the source and result dataset
job.utilityStat()
Feedback
Was this page helpful?