Ingestion of a single table using:
- a Spark SQL query from s3://<bucket>/query/<db>/<table>.sql
- a single configuration from s3://<bucket>/orquestradores/<db>/<table>.json
- a single target: s3, sns, dynamo, or rds.
Repository layout:

```
glue_orq/
├─ glue_job.py
└─ orq/
   ├─ __init__.py
   ├─ models.py
   ├─ config_loader.py
   ├─ query_loader.py
   ├─ source_reader.py
   ├─ orchestrator.py
   ├─ utils/
   │  └─ spark_session.py
   └─ targets/
      ├─ s3_writer.py
      ├─ sns_writer.py
      ├─ dynamo_writer.py
      └─ rds_writer.py
```
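A minimal sketch of what glue_job.py could look like, assuming the package exposes a single run() entry point (getResolvedOptions is standard Glue; oq.run and its signature are assumptions):

```python
# glue_job.py -- minimal sketch; oq.run() and its signature are assumptions
import sys

from awsglue.utils import getResolvedOptions

import orq as oq

# Resolve the job parameters (see the parameter list further below)
args = getResolvedOptions(
    sys.argv,
    ["JOB_NAME", "S3_BUCKET", "DB_NAME", "TABLE_NAME", "CONF_PREFIX", "QUERY_PREFIX"],
)

# Derive the documented locations:
# s3://<bucket>/orquestradores/<db>/<table>.json and s3://<bucket>/query/<db>/<table>.sql
conf_key = f"{args['CONF_PREFIX']}/{args['DB_NAME']}/{args['TABLE_NAME']}.json"
query_key = f"{args['QUERY_PREFIX']}/{args['DB_NAME']}/{args['TABLE_NAME']}.sql"

oq.run(bucket=args["S3_BUCKET"], conf_key=conf_key, query_key=query_key)  # assumed entry point
```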
Example configuration (s3://<bucket>/orquestradores/sampledb/customers.json):

```json
{
  "source": {
    "type": "glue",
    "options": { "database": "sampledb", "table": "customers" }
  },
  "target": {
    "type": "s3",
    "options": {
      "path": "s3://my-bucket/datalake/curated/customers/",
      "format": "parquet",
      "mode": "overwrite",
      "partitionBy": ["country"]
    }
  },
  "repartition": 8,
  "temp_view": "src"
}
```
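A sketch of how config_loader.py might fetch and parse this file (boto3's get_object is standard; the function name and plain-dict return are assumptions):

```python
# config_loader.py -- hypothetical loader; reads the JSON configuration from S3
import json

import boto3


def load_config(bucket: str, key: str) -> dict:
    """Fetch s3://<bucket>/<key> and parse it as the job configuration."""
    body = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read()
    return json.loads(body)
```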
Example query (s3://<bucket>/query/sampledb/customers.sql), run against the temp_view registered by the orchestrator:

```sql
SELECT *
FROM src
WHERE active = true;
```
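Inside orchestrator.py, the source is read, registered under the configured temp_view, and the query executed. A sketch of that step, assuming the "glue" source type reads through the Glue Data Catalog (spark.table, createOrReplaceTempView, and spark.sql are standard PySpark; the function name is assumed):

```python
# orchestrator.py (excerpt) -- hypothetical; source -> temp view -> SQL
from pyspark.sql import DataFrame, SparkSession


def transform(spark: SparkSession, cfg: dict, sql_text: str) -> DataFrame:
    src = cfg["source"]["options"]
    # "glue" source: read the table through the Glue Data Catalog
    df = spark.table(f"{src['database']}.{src['table']}")
    # Expose the source under the name the query expects (here: "src")
    df.createOrReplaceTempView(cfg["temp_view"])
    result = spark.sql(sql_text)
    # Optional repartition before writing
    if cfg.get("repartition"):
        result = result.repartition(cfg["repartition"])
    return result
```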
The job expects the following parameters:

```
--JOB_NAME glue-orq-customers
--S3_BUCKET my-bucket
--DB_NAME sampledb
--TABLE_NAME customers
--CONF_PREFIX orquestradores
--QUERY_PREFIX query
```
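Once the result DataFrame exists, the orchestrator dispatches on target.type. A sketch of targets/s3_writer.py, assuming the target options map directly onto Spark's DataFrameWriter (the write function name is an assumption):

```python
# targets/s3_writer.py -- hypothetical; writes the result per the "target" options
from pyspark.sql import DataFrame


def write(df: DataFrame, options: dict) -> None:
    (
        df.write.format(options.get("format", "parquet"))
        .mode(options.get("mode", "overwrite"))
        .partitionBy(*options.get("partitionBy", []))
        .save(options["path"])
    )
```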
The package is imported under the alias `import orq as oq`. For JDBC/RDS targets, make sure the appropriate JDBC driver is included with the job.
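As a sketch, targets/rds_writer.py could sit on Spark's JDBC sink; the option keys here (url, dbtable, user, password, driver) are an assumed config shape, and the driver jar itself still has to be attached to the job (e.g. via Glue's --extra-jars parameter):

```python
# targets/rds_writer.py -- hypothetical; the JDBC option keys are an assumed shape
from pyspark.sql import DataFrame


def write(df: DataFrame, options: dict) -> None:
    (
        df.write.format("jdbc")
        .option("url", options["url"])  # e.g. jdbc:postgresql://host:5432/db
        .option("dbtable", options["dbtable"])
        .option("user", options["user"])
        .option("password", options["password"])
        .option("driver", options.get("driver", "org.postgresql.Driver"))
        .mode(options.get("mode", "append"))
        .save()
    )
```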