Create initial version of notebooks #862

Merged
Changes from 1 commit
lint fix
sreedhar-guda committed Nov 19, 2024
commit 050419eac4652164d8101b66ac577735f685e017
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -54,6 +54,7 @@ repos:
name: mypy (static type checker for Python)
additional_dependencies:
- types-requests
- types-PyYAML

- repo: https://github.com/adrienverge/yamllint.git
rev: v1.35.1
@@ -1,7 +1,6 @@
dependencies:
- pip:
- jupyter-black==0.3.4
- black==24.4.2
- pytest==8.2.2
- jupyter-black==0.4.0
- pytest==8.3.3
- ipytest==0.14.2
- opentelemetry-exporter-otlp==1.25.0
- azure-monitor-opentelemetry-exporter==1.0.0b32
@@ -0,0 +1,151 @@
#############################################
# NOTE:
# Use {parking_ws} and {parking_lakehouse} to indicate the workspace and lakehouse names;
# these values are replaced at runtime.
#
# Field Descriptions
# ------------------
# type: grouping indicator for tables
# table_name: table name, including the schema
# data_file: path relative to Files/ in {parking_ws}.{parking_lakehouse}.
#   If specified, this data is loaded into the table named in table_name
#   in overwrite mode. Ex: if the path is "Files/data/dim_time.csv", set this
#   field to "data/dim_time.csv".
# create_sql: SQL statement to create the table. Only one statement is allowed.
#   It runs only when data_file is not specified; if both data_file and create_sql
#   are given, data_file takes precedence.
# ##############################################################################
version: 0.1
description: "DDLs for Parking sensor Fabric e2e sample"
table_types:
- type: fact
tables:
- table_name: dw.fact_parking
description: "Fact table for parking data"
data_file:
create_sql: |
CREATE TABLE {parking_ws}.{parking_lakehouse}.dw.fact_parking (
dim_date_id STRING,
dim_time_id STRING,
dim_parking_bay_id STRING,
dim_location_id STRING,
dim_st_marker_id STRING,
status STRING,
load_id STRING,
loaded_on TIMESTAMP
)
USING DELTA
- type: dimension
tables:
- table_name: dw.dim_st_marker
description: "Dimension table for {parking_ws}.{parking_lakehouse}.dw.dim_st_marker"
data_file:
create_sql: |
CREATE TABLE {parking_ws}.{parking_lakehouse}.dw.dim_st_marker (
dim_st_marker_id STRING,
st_marker_id STRING,
load_id STRING,
loaded_on TIMESTAMP
)
USING DELTA
- table_name: dw.dim_location
description: "Dimension table for {parking_ws}.{parking_lakehouse}.dw.dim_location"
data_file:
create_sql: |
CREATE TABLE {parking_ws}.{parking_lakehouse}.dw.dim_location (
dim_location_id STRING,
lat FLOAT,
lon FLOAT,
load_id STRING,
loaded_on TIMESTAMP
)
USING DELTA
- table_name: dw.dim_parking_bay
description: "Dimension table for {parking_ws}.{parking_lakehouse}.dw.dim_parking_bay"
data_file:
create_sql: |
CREATE TABLE {parking_ws}.{parking_lakehouse}.dw.dim_parking_bay (
dim_parking_bay_id STRING,
bay_id INT,
marker_id STRING,
meter_id STRING,
rd_seg_dsc STRING,
rd_seg_id STRING,
load_id STRING,
loaded_on TIMESTAMP
)
USING DELTA
- type: reference
tables:
- table_name: dw.dim_date
description: "Dimension table for {parking_ws}.{parking_lakehouse}.dw.dim_date"
data_file: data/dim_date.csv
- table_name: dw.dim_time
description: "Dimension table for {parking_ws}.{parking_lakehouse}.dw.dim_time"
data_file: data/dim_time.csv
- type: interim
tables:
- table_name: interim.parking_bay
description: "Interim table for {parking_ws}.{parking_lakehouse}.interim.parking_bay"
data_file:
create_sql: |
CREATE TABLE {parking_ws}.{parking_lakehouse}.interim.parking_bay (
bay_id INT,
last_edit TIMESTAMP,
marker_id STRING,
meter_id STRING,
rd_seg_dsc STRING,
rd_seg_id STRING,
the_geom STRUCT<coordinates: ARRAY<ARRAY<ARRAY<ARRAY<DOUBLE>>>>, type: STRING>,
load_id STRING,
loaded_on TIMESTAMP
)
USING DELTA
- table_name: interim.sensor
description: "Interim table for {parking_ws}.{parking_lakehouse}.interim.sensor"
data_file:
create_sql: |
CREATE TABLE {parking_ws}.{parking_lakehouse}.interim.sensor (
bay_id INT,
st_marker_id STRING,
lat FLOAT,
lon FLOAT,
location STRUCT<coordinates: ARRAY<DOUBLE>, type: STRING>,
status STRING,
load_id STRING,
loaded_on TIMESTAMP
)
USING DELTA
- type: error
tables:
- table_name: malformed.parking_bay
description: "Malformed records table for {parking_ws}.{parking_lakehouse}.malformed.parking_bay"
data_file:
create_sql: |
CREATE TABLE {parking_ws}.{parking_lakehouse}.malformed.parking_bay (
bay_id INT,
last_edit TIMESTAMP,
marker_id STRING,
meter_id STRING,
rd_seg_dsc STRING,
rd_seg_id STRING,
the_geom STRUCT<coordinates: ARRAY<ARRAY<ARRAY<ARRAY<DOUBLE>>>>, type: STRING>,
load_id STRING,
loaded_on TIMESTAMP
)
USING DELTA
- table_name: malformed.sensor
description: "Malformed records table for {parking_ws}.{parking_lakehouse}.malformed.sensor"
data_file:
create_sql: |
CREATE TABLE {parking_ws}.{parking_lakehouse}.malformed.sensor (
bay_id INT,
st_marker_id STRING,
lat FLOAT,
lon FLOAT,
location STRUCT<coordinates: ARRAY<DOUBLE>, type: STRING>,
status STRING,
load_id STRING,
loaded_on TIMESTAMP
)
USING DELTA
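
For context, here is a minimal sketch of how a setup notebook might consume the DDL file above. It is illustrative only: the config path, the workspace and lakehouse values, and the notebook-provided `spark` session are assumptions, not part of this PR.

```python
# Illustrative loader for the DDL config above (not part of this PR).
# Assumes PyYAML and an active Spark session ("spark", available in Fabric notebooks).
import yaml

PARKING_WS = "<workspace-name>"          # placeholder; normally read from the app config
PARKING_LAKEHOUSE = "<lakehouse-name>"   # placeholder

with open("config/parking_sensor_ddls.yaml") as f:   # path is an assumption
    ddl_config = yaml.safe_load(f)

for table_type in ddl_config["table_types"]:
    for table in table_type["tables"]:
        if table.get("data_file"):
            # data_file takes precedence: load the CSV into the table in overwrite mode.
            df = spark.read.csv(f"Files/{table['data_file']}", header=True, inferSchema=True)
            df.write.mode("overwrite").saveAsTable(table["table_name"])
        elif table.get("create_sql"):
            # Otherwise run the single CREATE TABLE statement, substituting the placeholders.
            spark.sql(
                table["create_sql"].format(
                    parking_ws=PARKING_WS, parking_lakehouse=PARKING_LAKEHOUSE
                )
            )
```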
@@ -0,0 +1,46 @@
# ##############################################################
# See https://docs.python.org/3/library/configparser.html
# Tips:
# - ${section:option} can be used to interpolate values from another section in the same file.
# - When creating stage-specific resources whose names must be globally unique (e.g. ADLS),
#   make sure those names are available before attempting to create them.
# - Env-specific sections such as [dev], [test], etc. are possible, but they can cause
#   maintenance issues and are not ideal. Instead, define common sections here and derive
#   env-specific values in the code based on the input "env/stage" value.
# ##############################################################

# Important:
# - DO NOT wrap strings in quotes.
# - All file paths are relative to the attached lakehouse's "Files/" path.
#

[DEFAULT]
service_name = <some unique service name. Ex: parking_sensors_fabric_e2e>
service_version = <service version number>

# Parking sensor workspace name and id
parking_ws = {{parking_ws}}
parking_ws_id = {{parking_ws_id}}
# Parking sensor lakehouse located in the workspace named above (parking_ws).
# This is passed as the default lakehouse at runtime.
parking_lakehouse = {{parking_lakehouse}}

[keyvault]
name = {{key_vault_name}}
uri = https://${keyvault:name}.vault.azure.net/

[setup]
process_name = 010_setup
# DDL file path, relative to the attached lakehouse's "Files/" path
ddl_file = config/parking_sensor_ddls.yaml

[standardize]
process_name = 020_standardize

[transform]
process_name = 030_transform

[otel]
appinsights_connection_name = {{appinsights-connection-name}}
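
Because the file uses `${section:option}` references, reading it requires `configparser` with `ExtendedInterpolation`. A minimal sketch follows; the file name and variable names are assumptions, not values from this PR.

```python
# Illustrative read of the settings file above; "application.cfg" is an assumed name.
# ExtendedInterpolation resolves ${section:option} references such as ${keyvault:name}.
from configparser import ConfigParser, ExtendedInterpolation

config = ConfigParser(interpolation=ExtendedInterpolation())
config.read("application.cfg")

service_name = config["DEFAULT"]["service_name"]
keyvault_uri = config["keyvault"]["uri"]  # resolves to https://<name>.vault.azure.net/
ddl_file = config["setup"]["ddl_file"]    # config/parking_sensor_ddls.yaml
```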
801 changes: 801 additions & 0 deletions e2e_samples/fabric_dataops_sample/src/notebooks/000_setup.ipynb

Large diffs are not rendered by default.

134 changes: 134 additions & 0 deletions e2e_samples/fabric_dataops_sample/src/scripts/otel_monitor_invoker.py
@@ -0,0 +1,134 @@
# This example uses the advanced configuration with the Azure Monitor exporters:
# ref: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry-exporter#microsoft-opentelemetry-exporter-for-azure-monitor

# A simpler configuration is also possible, as shown below, in which case the tracer,
# logger, and meter are preconfigured to send data to Azure Monitor.
# ref: https://learn.microsoft.com/en-us/azure/azure-monitor/app/opentelemetry-enable?tabs=python
# ```
# from azure.identity import ManagedIdentityCredential
# from azure.monitor.opentelemetry import configure_azure_monitor

# # Configure the Distro to authenticate with Azure Monitor - without a managed identity.
# configure_azure_monitor(
# connection_string="your-connection-string"
# )

# # Using a managed identity credential.
# configure_azure_monitor(
# connection_string="your-connection-string",
# credential=ManagedIdentityCredential(),
# )
# ```
# - https://learn.microsoft.com/en-us/python/api/overview/azure/monitor-opentelemetry-exporter-readme?view=azure-python-preview

import logging

# from azure.monitor.opentelemetry import configure_azure_monitor  # Not used here; we use the
# advanced configuration via the AzureMonitor*Exporter classes below.
from azure.monitor.opentelemetry.exporter import (
AzureMonitorLogExporter,
AzureMonitorMetricExporter,
AzureMonitorTraceExporter,
)

# metrics
# traces
from opentelemetry import metrics, trace
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource

# from opentelemetry.trace import SpanKind
# from opentelemetry.trace.status import StatusCode
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# logs


logging.basicConfig(level=logging.INFO)


class OpenTelemetryAppInsightsExporter:
def __init__(self, conn_string: str) -> None:
"""
Initializes the OpenTelemetryAppInsightsExporter class.
Args:
conn_string (str): Azure AppInsights connection string.
"""
self.conn_string = conn_string

def get_otel_tracer(self, trace_resource_attributes: dict, tracer_name: str = __name__) -> object:
"""
Creates and returns an OpenTelemetry tracer object.
Args:
trace_resource_attributes (dict): The OpenTelemetry resource attributes in dictionary format
tracer_name (str): The name of the tracer. Default is __name__.
Returns:
tracer: OpenTelemetry tracer object
"""
resource = Resource(attributes=trace_resource_attributes)
trace.set_tracer_provider(TracerProvider(resource=resource))
tracer = trace.get_tracer(tracer_name)
# Exporter to send data to AppInsights
trace_exporter = AzureMonitorTraceExporter(connection_string=self.conn_string)
span_processor = BatchSpanProcessor(trace_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)
return tracer

def get_otel_logger(
self, log_resource_attributes: dict, logger_name: str = __name__, add_console_handler: bool = True
) -> object:
"""
Creates and returns an OpenTelemetry logger object.
Args:
log_resource_attributes (dict): The OpenTelemetry resource attributes in dictionary format
logger_name (str): The name of the logger. Default is __name__.
add_console_handler (bool): Whether to add a console handler to the logger. Default is True.
Returns:
logger: OpenTelemetry logger object
"""
resource = Resource(attributes=log_resource_attributes)
log_exporter = AzureMonitorLogExporter(connection_string=self.conn_string)

logger_provider = LoggerProvider(resource=resource)
logger_provider.add_log_record_processor(BatchLogRecordProcessor(log_exporter))
handler = LoggingHandler(level=logging.INFO, logger_provider=logger_provider)
logging.getLogger().addHandler(handler) # Attach OTLP handler to root logger

logger = logging.getLogger(logger_name) # get namespaced logger

# Create a console handler (optional)
if add_console_handler:
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
logger.addHandler(console_handler)

return logger

def get_otel_metrics(
self, metric_resource_attributes: dict, metric_name: str = __name__, metric_version: str = "0"
) -> object:
"""
Creates and returns an OpenTelemetry metrics object.
Args:
metric_resource_attributes (dict): The OpenTelemetry resource attributes in dictionary format
metric_name (str): The name of the metric. Default is __name__.
metric_version (str): The version of the metric. Default is "0".
Returns:
meter: OpenTelemetry meter object
"""
resource = Resource(attributes=metric_resource_attributes)
metrics_exporter = AzureMonitorMetricExporter(connection_string=self.conn_string)
metrics_reader = PeriodicExportingMetricReader(metrics_exporter)
metrics_provider = MeterProvider(resource=resource, metric_readers=[metrics_reader])
metrics.set_meter_provider(metrics_provider)
meter = metrics.get_meter_provider().get_meter(name=metric_name, version=metric_version)

return meter
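
A brief usage sketch of the class above; the connection string, resource attributes, and span/metric names are placeholders, not values taken from this repository.

```python
# Hypothetical wiring of OpenTelemetryAppInsightsExporter in a notebook; the connection
# string, resource attributes, and names below are placeholders only.
exporter = OpenTelemetryAppInsightsExporter(conn_string="<appinsights-connection-string>")

resource_attributes = {"service.name": "parking_sensors_fabric_e2e", "service.version": "0.1.0"}

tracer = exporter.get_otel_tracer(resource_attributes, tracer_name="setup_notebook")
logger = exporter.get_otel_logger(resource_attributes, logger_name="setup_notebook")
meter = exporter.get_otel_metrics(resource_attributes, metric_name="setup_notebook")

with tracer.start_as_current_span("load_ddl"):
    logger.info("Starting DDL load")
    meter.create_counter("tables_created").add(1)
```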
21 changes: 16 additions & 5 deletions pyproject.toml
@@ -2,16 +2,27 @@
name = "modern_data_estate"
version = "0.1.0"
description = "Modern Data Estate"
authors = [{ name = "Microsoft ISE Data Tech Domain", email = "your.email@example.com" }]
authors = [
{ name = "Microsoft ISE Data Tech Domain", email = "your.email@example.com" },
]

dependencies = ["pandas", "numpy", "requests"]

[project.optional-dependencies]
code_qa = ["black", "isort", "ruff", "mypy", "yamllint", "nbqa", "pre-commit"]
code_qa = [
"black",
"isort",
"ruff",
"mypy",
"yamllint",
"nbqa",
"pre-commit",
"types-PyYAML",
]
test = ["pytest", "pytest-cov"]
release = ["build", "twine"]

dev = ["modern_data_estate[code_qa, test, release]" , "ipykernel"]
dev = ["modern_data_estate[code_qa, test, release]", "ipykernel"]
ci = ["modern_data_estate[test]", "pre-commit"]
cd = ["modern_data_estate[release]"]

@@ -46,9 +57,9 @@ exclude = [
"*.ipynb",
"archive/*",
"e2e_samples/parking_sensors/*",
"single_tech_samples/*"
"single_tech_samples/*",
]
builtins = ["mssparkutils", "spark", "display"]
builtins = ["notebookutils", "mssparkutils", "spark", "display"]


[tool.nbqa]