6. How do you manage datasets in Azure ML?
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
# Create URI File dataset (single file)
my_data = Data(
path="https://storage.blob.core.windows.net/data/train.csv",
type=AssetTypes.URI_FILE,
name="training-data",
description="Training dataset",
version="1"
)
ml_client.data.create_or_update(my_data)
# Create URI Folder dataset (directory)
folder_data = Data(
path="azureml://datastores/blob_store/paths/images/",
type=AssetTypes.URI_FOLDER,
name="image-data"
)
# Create MLTable (schema-aware)
from azure.ai.ml.entities import Data
mltable_data = Data(
path="./data/mltable_folder",
type=AssetTypes.MLTABLE,
name="structured-data"
)
# Access in training script
import argparse
import mltable
parser = argparse.ArgumentParser()
parser.add_argument("--input_data", type=str)
args = parser.parse_args()
tbl = mltable.load(args.input_data)
df = tbl.to_pandas_dataframe()
Datastores:
- Azure Blob Storage
- Azure Data Lake Gen2
- Azure Files
- Azure SQL Database
7. What are Azure ML Pipelines?
Azure ML Pipelines are reusable workflows for ML tasks that can be scheduled or triggered.
from azure.ai.ml import command, Input, Output
from azure.ai.ml.dsl import pipeline
# Define components
prep_data = command(
name="prep_data",
display_name="Prepare Data",
inputs={"raw_data": Input(type="uri_folder")},
outputs={"processed_data": Output(type="uri_folder")},
code="./src/prep",
command="python prep.py --raw_data ${{inputs.raw_data}} --output ${{outputs.processed_data}}",
environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest"
)
train_model = command(
name="train_model",
inputs={
"training_data": Input(type="uri_folder"),
"learning_rate": 0.01
},
outputs={"model": Output(type="mlflow_model")},
code="./src/train",
command="python train.py --data ${{inputs.training_data}} --lr ${{inputs.learning_rate}} --model_output ${{outputs.model}}",
environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
compute="training-cluster"
)
# Build pipeline
@pipeline(default_compute="training-cluster")
def training_pipeline(raw_data):
prep_step = prep_data(raw_data=raw_data)
train_step = train_model(training_data=prep_step.outputs.processed_data)
return {"model": train_step.outputs.model}
# Submit pipeline
pipeline_job = training_pipeline(raw_data=Input(type="uri_folder", path="azureml:raw-data:1"))
returned_job = ml_client.jobs.create_or_update(pipeline_job)
8. How do you deploy models in Azure ML?
Deployment Options:
| Option | Use Case | Features |
|--------|----------|----------|
| Managed Online Endpoints | Real-time inference | Auto-scale, blue-green, managed |
| Kubernetes Endpoints | Real-time, custom infra | Use existing AKS |
| Batch Endpoints | Large-scale batch | Parallel processing |
| Azure IoT Edge | Edge deployment | Low latency, offline |
from azure.ai.ml.entities import (
ManagedOnlineEndpoint,
ManagedOnlineDeployment,
Model,
Environment,
CodeConfiguration
)
# Create endpoint
endpoint = ManagedOnlineEndpoint(
name="churn-endpoint",
auth_mode="key"
)
ml_client.online_endpoints.begin_create_or_update(endpoint)
# Create deployment
blue_deployment = ManagedOnlineDeployment(
name="blue",
endpoint_name="churn-endpoint",
model=Model(path="./model"),
code_configuration=CodeConfiguration(
code="./src/score",
scoring_script="score.py"
),
environment=Environment(
conda_file="./env/conda.yml",
image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest"
),
instance_type="Standard_DS3_v2",
instance_count=2
)
ml_client.online_deployments.begin_create_or_update(blue_deployment)
# Set traffic
endpoint.traffic = {"blue": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint)
9. What is the Azure ML SDK?
SDK v2 (Current - Recommended):
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
# Connect to workspace
ml_client = MLClient(
credential=DefaultAzureCredential(),
subscription_id="xxx",
resource_group_name="myRG",
workspace_name="myWorkspace"
)
# List computes
for compute in ml_client.compute.list():
print(compute.name)
# Get model
model = ml_client.models.get(name="my-model", version="1")
# Submit job
from azure.ai.ml import command
job = command(
code="./src",
command="python train.py --data ${{inputs.data}}",
inputs={"data": Input(type="uri_folder", path="azureml:my-data:1")},
environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
compute="training-cluster"
)
returned_job = ml_client.jobs.create_or_update(job)
SDK v1 (Legacy):
from azureml.core import Workspace, Experiment, Run
ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name='my-experiment')
run = experiment.start_logging()
10. What are Environments in Azure ML?
Environments define the software dependencies for training and inference.
from azure.ai.ml.entities import Environment, BuildContext
# Create from conda file
env = Environment(
name="my-sklearn-env",
description="Scikit-learn environment",
conda_file="./env/conda.yml",
image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest"
)
ml_client.environments.create_or_update(env)
# conda.yml
name: sklearn-env
channels:
- conda-forge
dependencies:
- python=3.9
- scikit-learn=1.2
- pandas
- pip:
- mlflow
- azureml-mlflow
# Create from Dockerfile
env_docker = Environment(
name="custom-env",
build=BuildContext(
path="./docker",
dockerfile_path="Dockerfile"
)
)
# Use curated environment
job = command(
...
environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest"
)
11. What is MLflow integration in Azure ML?
Azure ML natively supports MLflow for experiment tracking, model management, and deployment.
import mlflow
from mlflow.models import infer_signature
# Auto-configure MLflow tracking
mlflow.set_tracking_uri(ml_client.tracking_uri)
# Set experiment
mlflow.set_experiment("churn-prediction")
with mlflow.start_run() as run:
# Log parameters
mlflow.log_param("learning_rate", 0.01)
mlflow.log_param("n_estimators", 100)
# Train model
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
# Log metrics
accuracy = model.score(X_test, y_test)
mlflow.log_metric("accuracy", accuracy)
# Log model with signature
signature = infer_signature(X_train, model.predict(X_train))
mlflow.sklearn.log_model(model, "model", signature=signature)
# Log artifacts
mlflow.log_artifact("./feature_importance.png")
# Register model from run
model_uri = f"runs:/{run.info.run_id}/model"
mlflow.register_model(model_uri, "churn-model")
MLflow Benefits:
- Framework-agnostic tracking
- Automatic model serialization
- Model signature enforcement
- Easy deployment with MLflow models
12. How do you implement MLOps in Azure ML?
# MLOps with Azure DevOps/GitHub Actions
# 1. Training Pipeline (Azure Pipelines YAML)
trigger:
branches:
include:
- main
paths:
include:
- src/training/*
stages:
- stage: Train
jobs:
- job: RunTraining
steps:
- task: AzureCLI@2
inputs:
azureSubscription: 'Azure Connection'
scriptType: 'bash'
scriptLocation: 'inlineScript'
inlineScript: |
az ml job create --file train-job.yml --resource-group myRG --workspace-name myWS
- stage: Register
condition: succeeded('Train')
jobs:
- job: RegisterModel
steps:
- task: AzureCLI@2
inputs:
inlineScript: |
az ml model create --name my-model --version $(Build.BuildId) --path azureml://jobs/$JOB_NAME/outputs/model
- stage: Deploy
condition: succeeded('Register')
jobs:
- deployment: DeployToStaging
environment: staging
strategy:
runOnce:
deploy:
steps:
- task: AzureCLI@2
inputs:
inlineScript: |
az ml online-endpoint update --name my-endpoint --traffic "green=100"
MLOps Components:
- Source control for code and configs
- Automated training pipelines
- Model versioning and registry
- Automated testing
- Blue-green deployments
- Model monitoring
13. What is Responsible AI in Azure ML?
Responsible AI tools help understand, protect, and control ML models.
Components:
- Model Interpretability: Understand model predictions
- Fairness: Detect and mitigate bias
- Error Analysis: Identify failure patterns
- Counterfactuals: What-if analysis
- Data Balance: Dataset analysis
from raiwidgets import ResponsibleAIDashboard
from responsibleai import RAIInsights
# Create RAI insights
rai_insights = RAIInsights(model, train_df, test_df, target_column, 'classification')
# Add components
rai_insights.explainer.add()
rai_insights.error_analysis.add()
rai_insights.counterfactual.add(total_CFs=10)
rai_insights.causal.add(treatment_features=['feature1'])
# Compute insights
rai_insights.compute()
# View dashboard
ResponsibleAIDashboard(rai_insights)
# In Azure ML
from azure.ai.ml.entities import ResponsibleAIInsights
rai_job = ResponsibleAIInsights(
target_column_name="target",
training_data=train_data,
test_data=test_data,
task_type="classification",
components=["explainer", "error_analysis", "fairness"]
)
14. How do you handle data labeling in Azure ML?
Azure ML provides built-in data labeling for creating training datasets.
Supported Tasks:
- Image classification (single/multi-label)
- Object detection (bounding boxes)
- Instance segmentation
- Text classification
- NER (Named Entity Recognition)
Features:
# Data Labeling capabilities:
1. ML-assisted labeling (pre-labeling)
2. Human-in-the-loop validation
3. Labeler management and assignment
4. Quality control and review
5. Export to various formats
# Create labeling project via Azure Portal:
1. Navigate to Data Labeling in workspace
2. Create new project
3. Select task type
4. Configure label classes
5. Upload data
6. Assign labelers
7. Monitor progress
# Export labeled data
from azure.ai.ml import MLClient
labeled_data = ml_client.data.get(name="labeled-images", version="1")
15. What are Managed Online Endpoints?
Managed Online Endpoints are fully managed real-time inference endpoints with auto-scaling, monitoring, and blue-green deployments.
# Create endpoint
endpoint = ManagedOnlineEndpoint(
name="my-endpoint",
auth_mode="key", # or "aml_token"
tags={"env": "production"}
)
ml_client.online_endpoints.begin_create_or_update(endpoint)
# Create deployment with auto-scale
deployment = ManagedOnlineDeployment(
name="production",
endpoint_name="my-endpoint",
model="azureml:my-model:1",
instance_type="Standard_DS3_v2",
instance_count=2,
scale_settings=TargetUtilizationScaleSettings(
min_instances=1,
max_instances=10,
target_utilization_percentage=70
),
request_settings=OnlineRequestSettings(
request_timeout_ms=60000,
max_concurrent_requests_per_instance=100
),
liveness_probe=ProbeSettings(
initial_delay=30,
period=10
)
)
ml_client.online_deployments.begin_create_or_update(deployment)
# Invoke endpoint
import urllib.request
import json
scoring_uri = ml_client.online_endpoints.get("my-endpoint").scoring_uri
api_key = ml_client.online_endpoints.get_keys("my-endpoint").primary_key
data = {"data": [[1, 2, 3, 4]]}
body = json.dumps(data).encode('utf-8')
req = urllib.request.Request(scoring_uri, body, {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'})
response = urllib.request.urlopen(req)
print(response.read())