Skip to content

Commit c25193f

Browse files
authored
Merge pull request #140 from rwnx/cli-refactor
inject progress
2 parents 68cfa95 + fb1fe02 commit c25193f

File tree

12 files changed

+61
-91
lines changed

12 files changed

+61
-91
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1919
### Added
2020
- Better error messages for unsupported fake types - the error should now explain the problem and link to the docs in the right section. [#133]
2121
- Error message for when a fake_type is used with the wrong config kwargs (these would have previously been caught under "unsupported fake types")
22+
- Event hooks for progress events. You can now use your own progress bar if you're invoking the process via the Python interface.
2223

2324
### Removed
2425
- Positional INPUT. Use the -i/--input option instead

pynonymizer/cli.py

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
ProcessSteps,
1414
)
1515
from pynonymizer import __version__
16+
from tqdm import tqdm
1617

1718

1819
app = typer.Typer()
@@ -197,6 +198,7 @@ def main(
197198
dry_run=dry_run,
198199
verbose=verbose,
199200
ignore_anonymization_errors=ignore_anonymization_errors,
201+
progress=tqdm,
200202
)
201203
except ModuleNotFoundError as error:
202204
if error.name == "pyodbc" and db_type == "mssql":

pynonymizer/database/__init__.py

-27
Original file line numberDiff line numberDiff line change
@@ -1,27 +0,0 @@
1-
import os
2-
import uuid
3-
from pynonymizer.database.mysql import MySqlProvider
4-
from pynonymizer.database.mssql import MsSqlProvider
5-
from pynonymizer.database.postgres import PostgreSqlProvider
6-
from pynonymizer.database.exceptions import UnknownDatabaseTypeError
7-
8-
9-
def get_temp_db_name(filename=None):
10-
name, _ = os.path.splitext(os.path.basename(filename))
11-
return f"{name}_{uuid.uuid4().hex}"
12-
13-
14-
def get_provider(type, *args, **kwargs):
15-
provider = None
16-
17-
if type == "mysql":
18-
provider = MySqlProvider
19-
if type == "mssql":
20-
provider = MsSqlProvider
21-
elif type == "postgres":
22-
provider = PostgreSqlProvider
23-
24-
if provider:
25-
return provider(*args, **kwargs)
26-
else:
27-
raise UnknownDatabaseTypeError(type)

pynonymizer/database/basic/__init__.py

-7
This file was deleted.
File renamed without changes.

pynonymizer/database/mssql/__init__.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from pynonymizer.database.provider import DatabaseProvider
21
from pynonymizer.database.provider import SEED_TABLE_NAME
32
from pynonymizer.strategy.update_column import UpdateColumnStrategyTypes
43
from pynonymizer.strategy.table import TableStrategyTypes
@@ -11,7 +10,6 @@
1110

1211
import math
1312
import logging
14-
from tqdm import tqdm
1513
from pathlib import PureWindowsPath, PurePosixPath
1614
import re
1715

@@ -34,7 +32,7 @@ def _extract_driver_version(driver):
3432
return 0
3533

3634

37-
class MsSqlProvider(DatabaseProvider):
35+
class MsSqlProvider:
3836
"""
3937
A pyodbc-based MSSQL provider.
4038
"""
@@ -53,6 +51,7 @@ def __init__(
5351
db_pass,
5452
db_name,
5553
seed_rows,
54+
progress,
5655
db_port=None,
5756
backup_compression=False,
5857
driver=None,
@@ -69,6 +68,7 @@ def __init__(
6968
self.db_pass = db_pass
7069
self.db_name = db_name
7170
self.db_port = db_port
71+
self.progress = progress
7272

7373
self.seed_rows = int(seed_rows)
7474

@@ -217,7 +217,9 @@ def __async_operation_progress(self, desc, cursor):
217217
# With STATS=x, we should receive 100/x resultsets, provided the backup is slow enough.
218218
# With some databases, it will jump from y% to 100, so we'll only get <x nextset calls.
219219
# Even SSMS doesn't get informed (read: it's not my fault, blame microsoft)
220-
with tqdm(desc=desc, total=math.floor(100 / self.__STATS)) as progressbar:
220+
with self.progress(
221+
desc=desc, total=math.floor(100 / self.__STATS)
222+
) as progressbar:
221223
while cursor.nextset():
222224
progressbar.update()
223225

@@ -266,7 +268,7 @@ def __insert_seed_row(self, qualifier_map):
266268
self.__db_execute(statement, value_list)
267269

268270
def __seed(self, qualifier_map):
269-
for i in tqdm(
271+
for i in self.progress(
270272
range(0, self.seed_rows), desc="Inserting seed data", unit="rows"
271273
):
272274
self.__insert_seed_row(qualifier_map)
@@ -318,7 +320,7 @@ def anonymize_database(self, database_strategy):
318320

319321
anonymization_errors = []
320322

321-
with tqdm(
323+
with self.progress(
322324
desc="Anonymizing database", total=len(table_strategies)
323325
) as progressbar:
324326
for table_strategy in table_strategies:

pynonymizer/database/mysql/__init__.py

+10-9
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
1-
from tqdm import tqdm
21
from time import sleep
32
import logging
4-
from pynonymizer.database.provider import DatabaseProvider, SEED_TABLE_NAME
3+
from pynonymizer.database.input import resolve_input
4+
from pynonymizer.database.output import resolve_output
5+
from pynonymizer.database.provider import SEED_TABLE_NAME
56
from pynonymizer.database.exceptions import UnsupportedTableStrategyError
67
from pynonymizer.database.mysql import execution, query_factory
7-
from pynonymizer.database.basic.input import resolve_input
8-
from pynonymizer.database.basic.output import resolve_output
98
from pynonymizer.strategy.table import TableStrategyTypes
109

1110

12-
class MySqlProvider(DatabaseProvider):
11+
class MySqlProvider:
1312
"""
1413
A command-line based mysql provider. Uses `mysql` and `mysqldump`,
1514
Because of the efficiency of piping mass amounts of sql into the command-line client.
@@ -27,6 +26,7 @@ def __init__(
2726
db_pass,
2827
db_name,
2928
seed_rows,
29+
progress,
3030
db_port=None,
3131
cmd_opts=None,
3232
dump_opts=None,
@@ -45,6 +45,7 @@ def __init__(
4545
self.db_pass = db_pass
4646
self.db_name = db_name
4747
self.db_port = db_port
48+
self.progress = progress
4849

4950
self.seed_rows = int(seed_rows)
5051

@@ -59,7 +60,7 @@ def __seed(self, qualifier_map):
5960
"""
6061
'Seed' the database with a bunch of pre-generated random records so updates can be performed in batch updates
6162
"""
62-
for i in tqdm(
63+
for i in self.progress(
6364
range(0, self.seed_rows), desc="Inserting seed data", unit="rows"
6465
):
6566
self.logger.debug(f"Inserting seed row {i}")
@@ -122,7 +123,7 @@ def anonymize_database(self, database_strategy):
122123

123124
anonymization_errors = []
124125

125-
with tqdm(
126+
with self.progress(
126127
desc="Anonymizing database", total=len(table_strategies)
127128
) as progressbar:
128129
for table_strategy in table_strategies:
@@ -196,7 +197,7 @@ def restore_database(self, input_path):
196197
try:
197198
batch_processor = self.__runner.open_batch_processor()
198199
with input_obj.open() as dumpfile_data:
199-
with tqdm(
200+
with self.progress(
200201
desc="Restoring",
201202
total=dumpsize,
202203
unit="B",
@@ -221,7 +222,7 @@ def dump_database(self, output_path):
221222

222223
dump_process = self.__dumper.open_dumper()
223224
with output_obj.open() as output_file:
224-
with tqdm(
225+
with self.progress(
225226
desc="Dumping",
226227
total=dumpsize_estimate,
227228
unit="B",
File renamed without changes.

pynonymizer/database/postgres/__init__.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
from pynonymizer.database.provider import SEED_TABLE_NAME
2-
from tqdm import tqdm
32
import logging
4-
from pynonymizer.database.provider import DatabaseProvider
53
from pynonymizer.database.exceptions import UnsupportedTableStrategyError
64
from pynonymizer.database.postgres import execution, query_factory
7-
from pynonymizer.database.basic.input import resolve_input
8-
from pynonymizer.database.basic.output import resolve_output
5+
from pynonymizer.database.input import resolve_input
6+
from pynonymizer.database.output import resolve_output
97
from pynonymizer.strategy.table import TableStrategyTypes
108

119

12-
class PostgreSqlProvider(DatabaseProvider):
10+
class PostgreSqlProvider:
1311
"""
1412
A command-line based postgres provider. Uses `psql` and `pg_dump`,
1513
because of the efficiency of piping mass amounts of sql into the command-line client.
@@ -26,6 +24,7 @@ def __init__(
2624
db_pass,
2725
db_name,
2826
seed_rows,
27+
progress,
2928
db_port=None,
3029
cmd_opts=None,
3130
dump_opts=None,
@@ -45,6 +44,7 @@ def __init__(
4544
self.db_pass = db_pass
4645
self.db_name = db_name
4746
self.db_port = db_port
47+
self.progress = progress
4848

4949
self.seed_rows = int(seed_rows)
5050

@@ -69,7 +69,7 @@ def __seed(self, qualifier_map):
6969
"""
7070
'Seed' the database with a bunch of pre-generated random records so updates can be performed in batch updates
7171
"""
72-
for i in tqdm(
72+
for i in self.progress(
7373
range(0, self.seed_rows), desc="Inserting seed data", unit="rows"
7474
):
7575
self.logger.debug(f"Inserting seed row {i}")
@@ -132,7 +132,7 @@ def anonymize_database(self, database_strategy):
132132

133133
anonymization_errors = []
134134

135-
with tqdm(
135+
with self.progress(
136136
desc="Anonymizing database", total=len(table_strategies)
137137
) as progressbar:
138138
for table_strategy in table_strategies:
@@ -195,7 +195,7 @@ def restore_database(self, input_path):
195195
batch_processor = self.__runner.open_batch_processor()
196196
try:
197197
with input_obj.open() as dumpfile_data:
198-
with tqdm(
198+
with self.progress(
199199
desc="Restoring",
200200
total=dumpsize,
201201
unit="B",
@@ -220,7 +220,7 @@ def dump_database(self, output_path):
220220

221221
dump_process = self.__dumper.open_dumper()
222222
with output_obj.open() as output_file:
223-
with tqdm(
223+
with self.progress(
224224
desc="Dumping",
225225
total=dumpsize_estimate,
226226
unit="B",

pynonymizer/database/provider.py

-24
Original file line numberDiff line numberDiff line change
@@ -1,25 +1 @@
1-
from abc import ABC, abstractmethod
2-
31
SEED_TABLE_NAME = "_pynonymizer_seed_fake_data"
4-
5-
6-
class DatabaseProvider(ABC):
7-
@abstractmethod
8-
def create_database(self):
9-
pass
10-
11-
@abstractmethod
12-
def drop_database(self):
13-
pass
14-
15-
@abstractmethod
16-
def anonymize_database(self, database_strategy):
17-
pass
18-
19-
@abstractmethod
20-
def restore_database(self, input_obj):
21-
pass
22-
23-
@abstractmethod
24-
def dump_database(self, output_obj):
25-
pass

pynonymizer/pynonymize.py

+29-7
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,27 @@
1+
from dataclasses import dataclass
12
import logging
2-
from pynonymizer.database import get_temp_db_name, get_provider
3+
from typing import Callable, Literal, Union
4+
from pynonymizer.database.mssql import MsSqlProvider
5+
from pynonymizer.database.mysql import MySqlProvider
6+
from pynonymizer.database.postgres import PostgreSqlProvider
37
from pynonymizer.fake import FakeColumnGenerator
48
from pynonymizer.strategy.parser import StrategyParser
59
from pynonymizer.strategy.config import read_config
610
from pynonymizer.exceptions import ArgumentValidationError, DatabaseConnectionError
711
from pynonymizer.process_steps import StepActionMap, ProcessSteps
8-
12+
import os.path
13+
import uuid
914

1015
logger = logging.getLogger(__name__)
1116

1217

18+
def get_temp_db_name(filename=None):
19+
name, _ = os.path.splitext(os.path.basename(filename))
20+
return f"{name}_{uuid.uuid4().hex}"
21+
22+
1323
def pynonymize(
24+
progress,
1425
input_path=None,
1526
strategyfile_path=None,
1627
output_path=None,
@@ -37,6 +48,7 @@ def pynonymize(
3748
ArgumentValidationError: used when kwargs are missing or unable to be auto-resolved.
3849
3950
"""
51+
4052
# Default and Normalize args
4153
if only_step is not None:
4254
only_step = ProcessSteps.from_value(only_step)
@@ -97,9 +109,6 @@ def pynonymize(
97109
if db_name is None:
98110
validations.append("Missing DB_NAME: Auto-resolve failed.")
99111

100-
if len(validations) > 0:
101-
raise ArgumentValidationError(validations)
102-
103112
# init strategy as it relies on I/O - fail fast here preferred to after restore
104113
if not actions.skipped(ProcessSteps.ANONYMIZE_DB):
105114
strategy_parser = StrategyParser()
@@ -118,14 +127,27 @@ def pynonymize(
118127
logger.debug(
119128
"Database: (%s:%s)%s@%s name: %s", db_host, db_port, db_type, db_user, db_name
120129
)
121-
db_provider = get_provider(
122-
type=db_type,
130+
131+
if db_type == "mysql":
132+
Provider = MySqlProvider
133+
elif db_type == "postgres":
134+
Provider = PostgreSqlProvider
135+
elif db_type == "mssql":
136+
Provider = MsSqlProvider
137+
else:
138+
validations.append(f"{db_type} is not a known database type.")
139+
140+
if len(validations) > 0:
141+
raise ArgumentValidationError(validations)
142+
143+
db_provider = Provider(
123144
db_host=db_host,
124145
db_user=db_user,
125146
db_pass=db_password,
126147
db_name=db_name,
127148
db_port=db_port,
128149
seed_rows=seed_rows,
150+
progress=progress,
129151
**db_kwargs,
130152
)
131153

tests/database/test_input_output.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
from pynonymizer.database.basic.input import (
1+
from pynonymizer.database.input import (
22
GzipInput,
33
RawInput,
44
StdInInput,
55
UnknownInputTypeError,
66
resolve_input,
77
)
8-
from pynonymizer.database.basic.output import (
8+
from pynonymizer.database.output import (
99
UnknownOutputTypeError,
1010
resolve_output,
1111
RawOutput,

0 commit comments

Comments
 (0)