Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions .github/workflows/test-and-deploy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
name: Run Tests & Deploy
on: [push, pull_request]

jobs:
build-and-test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
services:
postgres:
image: postgres
env:
POSTGRES_PASSWORD: postgres
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
pip install -e '.'
- name: Install packages needed for CI
run: pip install pre-commit
- name: Lint all files
run: pre-commit run --all-files --show-diff-on-failure
- name: Run tests
run: python setup.py test
deploy:
runs-on: ubuntu-latest
needs: build-and-test
if: github.ref == 'refs/heads/main'
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
token: ${{ secrets.PUBLISH_TOKEN }}
- name: Semantic release
uses: relekang/python-semantic-release@master
with:
# A Personal Access Token that belongs to an admin of the repo must
# be set in the PUBLISH_TOKEN secret to bypass `main` branch protection
github_token: ${{ secrets.PUBLISH_TOKEN }}
22 changes: 21 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,30 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.1.4
rev: v0.1.6
hooks:
# Run the linter.
- id: ruff
args: [ --fix ]
# Run the formatter.
- id: ruff-format
- repo: local
hooks:
- id: commitizen-branch
name: commitizen conditionally check branch
description: >
Commitizen fails when there are no new commits. To overcome this, we created
a tiny wrapper that verifies there are commits to check.

Original description from Commitizen:
Check all commit messages that are already on the current branch but not the
default branch on the origin repository. Useful for checking messages after
the fact (e.g., pre-push or in CI) without an expensive check of the entire
repository history.
entry: ./conditional-commitizen.sh
always_run: true
language: python
minimum_pre_commit_version: "1.4.3"
additional_dependencies: [
'commitizen',
]
3 changes: 3 additions & 0 deletions conditional-commitizen.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Run `cz check` only when the current branch has commits that origin/HEAD
# does not; Commitizen fails when there are no new commits to check.
rev_range="origin/HEAD..HEAD"
pending=$(git rev-list --count "$rev_range")
# Treat an empty count (git could not resolve the range) as "nothing to check".
if [ "${pending:-0}" -gt 0 ]; then
    cz check --rev-range "$rev_range"
fi
50 changes: 33 additions & 17 deletions pgantomizer/anonymize.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,9 @@ def check_schema(cursor, schema, db_args):


def get_column_update(schema, table, column, data_type):
custom_rule = get_in(schema, [table, "custom_rules", column]) if schema[table] else None
custom_rule = (
get_in(schema, [table, "custom_rules", column]) if schema[table] else None
)

if column == get_table_pk_name(schema, table) or (
schema[table] and column in schema[table].get("raw", [])
Expand All @@ -190,7 +192,9 @@ def get_column_update(schema, table, column, data_type):
'Custom rule "{}" must provide a non-None value'.format(custom_rule)
)
else:
return "{column} = '{value}'".format(column=column, value=custom_rule["value"])
return "{column} = '{value}'".format(
column=column, value=custom_rule["value"]
)
elif custom_rule and custom_rule not in CUSTOM_ANONYMIZATION_RULES:
raise MissingAnonymizationRuleError(
'Custom rule "{}" is not defined'.format(custom_rule)
Expand Down Expand Up @@ -222,9 +226,13 @@ def anonymize_table(conn, cursor, schema, table, disable_schema_changes):
cascade = " CASCADE"

logging.debug(
"Running TRUNCATE{cascade} on {table} ...".format(table=table, cascade=cascade)
"Running TRUNCATE{cascade} on {table} ...".format(
table=table, cascade=cascade
)
)
cursor.execute(
"TRUNCATE {table} {cascade}".format(table=table, cascade=cascade)
)
cursor.execute("TRUNCATE {table} {cascade}".format(table=table, cascade=cascade))
return

# Generate list of column_update SQL snippets for UPDATE
Expand All @@ -236,20 +244,24 @@ def anonymize_table(conn, cursor, schema, table, disable_schema_changes):
updated_column_names = []
for column_name, data_type in cursor.fetchall():
if not disable_schema_changes: # Bypass schema changes if explicitly requested
prepare_column_for_anonymization(conn, cursor, table, column_name, data_type)
prepare_column_for_anonymization(
conn, cursor, table, column_name, data_type
)
column_update = get_column_update(schema, table, column_name, data_type)
if column_update is not None:
column_updates.append(column_update)
updated_column_names.append(column_name)

# Process UPDATE if any column_updates requested
if len(column_updates) > 0:
update_statement = "UPDATE {table} SET {column_updates_sql} {where_clause}".format(
table=table,
column_updates_sql=", ".join(column_updates),
where_clause="WHERE {}".format(
schema[table].get("where", "TRUE") if schema[table] else "TRUE"
),
update_statement = (
"UPDATE {table} SET {column_updates_sql} {where_clause}".format(
table=table,
column_updates_sql=", ".join(column_updates),
where_clause="WHERE {}".format(
schema[table].get("where", "TRUE") if schema[table] else "TRUE"
),
)
)
logging.debug(
"Running UPDATE on {} for columns {} ...".format(
Expand All @@ -269,7 +281,9 @@ def anonymize_db(schema, db_args, disable_schema_changes):
"SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' AND table_type <> 'VIEW' ORDER BY table_name;"
)
for table_name in cursor.fetchall():
anonymize_table(conn, cursor, schema, table_name[0], disable_schema_changes)
anonymize_table(
conn, cursor, schema, table_name[0], disable_schema_changes
)
logging.debug("Anonymization complete!")


Expand All @@ -291,9 +305,7 @@ def load_anonymize_remove(
try:
load_db_to_new_instance(dump_file, db_args)
anonymize_db(schema, db_args, disable_schema_changes)
except (
Exception
): # Any exception must result into dropping the schema to prevent sensitive data leakage
except Exception: # Any exception must result into dropping the schema to prevent sensitive data leakage
drop_schema(db_args)
raise
finally:
Expand All @@ -308,7 +320,9 @@ def main():
epilog="Beware that all tables in the target DB are dropped "
"prior to loading the dump and anonymization. See README.md for details.",
)
parser.add_argument("-v", "--verbose", action="count", help="increase output verbosity")
parser.add_argument(
"-v", "--verbose", action="count", help="increase output verbosity"
)
parser.add_argument(
"-s",
"--skip-restore",
Expand Down Expand Up @@ -349,7 +363,9 @@ def main():
help="password of the Postgres user with access to the anonymized database",
default="",
)
parser.add_argument("--host", help="host where the DB is running", default="localhost")
parser.add_argument(
"--host", help="host where the DB is running", default="localhost"
)
parser.add_argument("--port", help="port where the DB is running", default="5432")

args = parser.parse_args()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
],
tests_require=[
"pytest",
"pytest-postgresql==1.3.0",
"pytest-postgresql",
"pre-commit",
],
entry_points={
Expand Down
3 changes: 0 additions & 3 deletions tests/asserts.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
from psycopg2.extras import NamedTupleCursor


def assert_customer_anonymized(customer, name, language, currency, ip):
assert customer[1] == name
assert customer[2] == language
Expand Down
4 changes: 3 additions & 1 deletion tests/test_pgantomizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ def test_dump_and_load(original_db, anonymized):

def test_load_anonymize_remove(dumped_db, anonymized):
assert_db_empty(anonymized)
load_anonymize_remove(DUMP_PATH, SCHEMA_PATH, leave_dump=False, db_args=ANONYMIZED_DB_ARGS)
load_anonymize_remove(
DUMP_PATH, SCHEMA_PATH, leave_dump=False, db_args=ANONYMIZED_DB_ARGS
)
assert_db_anonymized(anonymized)


Expand Down