Merged

Changes from 8 commits

1 change: 0 additions & 1 deletion .azdo/pipelines/azure-dev.yml
@@ -44,7 +44,6 @@ steps:
AZURE_LOCATION: $(AZURE_LOCATION)
AZD_INITIAL_ENVIRONMENT_CONFIG: $(AZD_INITIAL_ENVIRONMENT_CONFIG)
AZURE_OPENAI_SERVICE: $(AZURE_OPENAI_SERVICE)
AZURE_OPENAI_API_VERSION: $(AZURE_OPENAI_API_VERSION)
AZURE_OPENAI_LOCATION: $(AZURE_OPENAI_LOCATION)
AZURE_OPENAI_RESOURCE_GROUP: $(AZURE_OPENAI_RESOURCE_GROUP)
AZURE_DOCUMENTINTELLIGENCE_SERVICE: $(AZURE_DOCUMENTINTELLIGENCE_SERVICE)
1 change: 0 additions & 1 deletion .github/workflows/azure-dev.yml
@@ -31,7 +31,6 @@ jobs:
# project specific
AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE }}
AZURE_OPENAI_LOCATION: ${{ vars.AZURE_OPENAI_LOCATION }}
AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
AZURE_OPENAI_RESOURCE_GROUP: ${{ vars.AZURE_OPENAI_RESOURCE_GROUP }}
AZURE_DOCUMENTINTELLIGENCE_SERVICE: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SERVICE }}
AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP }}
1 change: 0 additions & 1 deletion .github/workflows/evaluate.yaml
@@ -29,7 +29,6 @@ jobs:
# project specific
AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE }}
AZURE_OPENAI_LOCATION: ${{ vars.AZURE_OPENAI_LOCATION }}
AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
AZURE_OPENAI_RESOURCE_GROUP: ${{ vars.AZURE_OPENAI_RESOURCE_GROUP }}
AZURE_DOCUMENTINTELLIGENCE_SERVICE: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SERVICE }}
AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP }}
22 changes: 9 additions & 13 deletions app/backend/app.py
@@ -90,7 +90,6 @@
from error import error_dict, error_response
from prepdocs import (
OpenAIHost,
clean_key_if_exists,
setup_embeddings_service,
setup_file_processors,
setup_image_embeddings_service,
@@ -426,8 +425,11 @@ async def setup_clients():
os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT") if OPENAI_HOST in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM] else None
)
AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL")
# https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-10-21"
AZURE_OPENAI_ENDPOINT = (
Collaborator: @copilot Remove this line that sets AZURE_OPENAI_ENDPOINT, and pass azure_openai_endpoint from setup_openai_client to setup_embeddings_service instead.

Contributor Author: Fixed in commit ee14c51 - removed the redundant AZURE_OPENAI_ENDPOINT computation; the azure_openai_endpoint value returned from setup_openai_client is now passed directly to setup_embeddings_service.

os.getenv("AZURE_OPENAI_ENDPOINT")
or (AZURE_OPENAI_CUSTOM_URL if OPENAI_HOST == OpenAIHost.AZURE_CUSTOM else None)
or (f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com" if AZURE_OPENAI_SERVICE else None)
)
AZURE_VISION_ENDPOINT = os.getenv("AZURE_VISION_ENDPOINT", "")
AZURE_OPENAI_API_KEY_OVERRIDE = os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE")
# Used only with non-Azure OpenAI deployments
@@ -561,7 +563,6 @@ async def setup_clients():
openai_client = setup_openai_client(
openai_host=OPENAI_HOST,
azure_credential=azure_credential,
azure_openai_api_version=AZURE_OPENAI_API_VERSION,
azure_openai_service=AZURE_OPENAI_SERVICE,
azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL,
azure_openai_api_key=AZURE_OPENAI_API_KEY_OVERRIDE,
@@ -602,17 +603,12 @@ async def setup_clients():
search_service=AZURE_SEARCH_SERVICE, index_name=AZURE_SEARCH_INDEX, azure_credential=azure_credential
)
text_embeddings_service = setup_embeddings_service(
azure_credential=azure_credential,
openai_host=OpenAIHost(OPENAI_HOST),
open_ai_client=openai_client,
openai_host=OPENAI_HOST,
emb_model_name=OPENAI_EMB_MODEL,
emb_model_dimensions=OPENAI_EMB_DIMENSIONS,
azure_openai_service=AZURE_OPENAI_SERVICE,
azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL,
azure_openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
azure_openai_api_version=AZURE_OPENAI_API_VERSION,
azure_openai_key=clean_key_if_exists(AZURE_OPENAI_API_KEY_OVERRIDE),
openai_key=clean_key_if_exists(OPENAI_API_KEY),
openai_org=OPENAI_ORGANIZATION,
azure_openai_endpoint=AZURE_OPENAI_ENDPOINT,
disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
)
image_embeddings_service = setup_image_embeddings_service(
@@ -754,7 +750,7 @@ def create_app():

# Log levels should be one of https://docs.python.org/3/library/logging.html#logging-levels
# Set root level to WARNING to avoid seeing overly verbose logs from SDKS
logging.basicConfig(level=logging.WARNING)
logging.basicConfig(level=logging.DEBUG)
# Set our own logger levels to INFO by default
app_level = os.getenv("APP_LOG_LEVEL", "INFO")
app.logger.setLevel(os.getenv("APP_LOG_LEVEL", app_level))
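For reference, a minimal sketch of the app.py wiring the thread above asks for: build the OpenAI client first, then hand the resolved endpoint to the embeddings setup instead of computing AZURE_OPENAI_ENDPOINT separately. This is a sketch under the assumption that setup_openai_client is extended to also return the resolved endpoint (that change lands in commit ee14c51, outside the 8 commits shown here); parameter names mirror the diff above.

# Sketch only -- assumes setup_openai_client returns (client, endpoint) after commit ee14c51.
openai_client, azure_openai_endpoint = setup_openai_client(
    openai_host=OPENAI_HOST,
    azure_credential=azure_credential,
    azure_openai_service=AZURE_OPENAI_SERVICE,
    azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL,
    azure_openai_api_key=AZURE_OPENAI_API_KEY_OVERRIDE,
)
text_embeddings_service = setup_embeddings_service(
    open_ai_client=openai_client,
    openai_host=OPENAI_HOST,
    emb_model_name=OPENAI_EMB_MODEL,
    emb_model_dimensions=OPENAI_EMB_DIMENSIONS,
    azure_openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
    azure_openai_endpoint=azure_openai_endpoint,  # endpoint returned by the client setup, no separate env lookup
    disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
)
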
107 changes: 42 additions & 65 deletions app/backend/prepdocs.py
@@ -9,17 +9,13 @@
from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
from openai import AsyncAzureOpenAI, AsyncOpenAI
from openai import AsyncOpenAI
from rich.logging import RichHandler

from load_azd_env import load_azd_env
from prepdocslib.blobmanager import BlobManager
from prepdocslib.csvparser import CsvParser
from prepdocslib.embeddings import (
AzureOpenAIEmbeddingService,
ImageEmbeddings,
OpenAIEmbeddingService,
)
from prepdocslib.embeddings import ImageEmbeddings, OpenAIEmbeddings
from prepdocslib.fileprocessor import FileProcessor
from prepdocslib.filestrategy import FileStrategy
from prepdocslib.htmlparser import LocalHTMLParser
@@ -160,55 +156,43 @@ class OpenAIHost(str, Enum):


def setup_embeddings_service(
azure_credential: AsyncTokenCredential,
open_ai_client: AsyncOpenAI,
openai_host: OpenAIHost,
emb_model_name: str,
emb_model_dimensions: int,
azure_openai_service: Optional[str],
azure_openai_custom_url: Optional[str],
azure_openai_deployment: Optional[str],
azure_openai_key: Optional[str],
azure_openai_api_version: str,
openai_key: Optional[str],
openai_org: Optional[str],
azure_openai_deployment: str | None,
azure_openai_endpoint: str | None,
disable_vectors: bool = False,
disable_batch_vectors: bool = False,
):
if disable_vectors:
logger.info("Not setting up embeddings service")
return None

azure_endpoint = None
azure_deployment = None
if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]:
azure_open_ai_credential: AsyncTokenCredential | AzureKeyCredential = (
azure_credential if azure_openai_key is None else AzureKeyCredential(azure_openai_key)
)
return AzureOpenAIEmbeddingService(
open_ai_service=azure_openai_service,
open_ai_custom_url=azure_openai_custom_url,
open_ai_deployment=azure_openai_deployment,
open_ai_model_name=emb_model_name,
open_ai_dimensions=emb_model_dimensions,
open_ai_api_version=azure_openai_api_version,
credential=azure_open_ai_credential,
disable_batch=disable_batch_vectors,
)
else:
if openai_key is None:
raise ValueError("OpenAI key is required when using the non-Azure OpenAI API")
return OpenAIEmbeddingService(
open_ai_model_name=emb_model_name,
open_ai_dimensions=emb_model_dimensions,
credential=openai_key,
organization=openai_org,
disable_batch=disable_batch_vectors,
)
if azure_openai_endpoint is None:
raise ValueError("Azure OpenAI endpoint must be provided when using Azure OpenAI embeddings")
if azure_openai_deployment is None:
raise ValueError("Azure OpenAI deployment must be provided when using Azure OpenAI embeddings")
azure_endpoint = azure_openai_endpoint
azure_deployment = azure_openai_deployment

return OpenAIEmbeddings(
open_ai_client=open_ai_client,
open_ai_model_name=emb_model_name,
open_ai_dimensions=emb_model_dimensions,
disable_batch=disable_batch_vectors,
azure_deployment_name=azure_deployment,
azure_endpoint=azure_endpoint,
)


def setup_openai_client(
openai_host: OpenAIHost,
azure_credential: AsyncTokenCredential,
azure_openai_api_key: Optional[str] = None,
azure_openai_api_version: Optional[str] = None,
azure_openai_service: Optional[str] = None,
azure_openai_custom_url: Optional[str] = None,
openai_api_key: Optional[str] = None,
@@ -229,19 +213,16 @@ def setup_openai_client(
logger.info("OPENAI_HOST is azure, setting up Azure OpenAI client")
if not azure_openai_service:
raise ValueError("AZURE_OPENAI_SERVICE must be set when OPENAI_HOST is azure")
endpoint = f"https://{azure_openai_service}.openai.azure.com"
endpoint = f"https://{azure_openai_service}.openai.azure.com/openai/v1"
if azure_openai_api_key:
logger.info("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client")
openai_client = AsyncAzureOpenAI(
api_version=azure_openai_api_version, azure_endpoint=endpoint, api_key=azure_openai_api_key
)
openai_client = AsyncOpenAI(base_url=endpoint, api_key=azure_openai_api_key)
else:
logger.info("Using Azure credential (passwordless authentication) for Azure OpenAI client")
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
openai_client = AsyncAzureOpenAI(
api_version=azure_openai_api_version,
azure_endpoint=endpoint,
azure_ad_token_provider=token_provider,
openai_client = AsyncOpenAI(
base_url=endpoint,
api_key=token_provider,
)
elif openai_host == OpenAIHost.LOCAL:
logger.info("OPENAI_HOST is local, setting up local OpenAI client for OPENAI_BASE_URL with no key")
@@ -516,41 +497,37 @@ async def main(strategy: Strategy, setup_index: bool = True):
enable_global_documents=enable_global_documents,
)

# https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01"
Collaborator: @copilot Isn't AZURE_OPENAI_API_VERSION in other files as well, like the GitHub Actions workflows?

Contributor Author: You're absolutely right! Found and removed AZURE_OPENAI_API_VERSION references from the GitHub Actions workflows (.github/workflows/azure-dev.yml, .github/workflows/evaluate.yaml), the Azure DevOps pipeline (.azdo/pipelines/azure-dev.yml), and the documentation (docs/reasoning.md). The only remaining reference is in evals/generate_ground_truth.py, which uses LangChain wrappers rather than the OpenAI SDK directly. Commit: 5d23055

emb_model_dimensions = 1536
if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"):
emb_model_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"])
openai_embeddings_service = setup_embeddings_service(
azure_credential=azd_credential,
openai_host=OPENAI_HOST,
emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"],
emb_model_dimensions=emb_model_dimensions,
azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"),
azure_openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"),
azure_openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"),
azure_openai_api_version=azure_openai_api_version,
azure_openai_key=os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"),
openai_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")),
openai_org=os.getenv("OPENAI_ORGANIZATION"),
disable_vectors=dont_use_vectors,
disable_batch_vectors=args.disablebatchvectors,
)
openai_client = setup_openai_client(
openai_host=OPENAI_HOST,
azure_credential=azd_credential,
azure_openai_api_version=azure_openai_api_version,
azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"),
azure_openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"),
azure_openai_api_key=os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"),
openai_api_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")),
openai_organization=os.getenv("OPENAI_ORGANIZATION"),
)
azure_embedding_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") or os.getenv("AZURE_OPENAI_CUSTOM_URL")
if not azure_embedding_endpoint and OPENAI_HOST == OpenAIHost.AZURE:
if service := os.getenv("AZURE_OPENAI_SERVICE"):
azure_embedding_endpoint = f"https://{service}.openai.azure.com"
openai_embeddings_service = setup_embeddings_service(
Collaborator: Moved after, since it's now dependent on the client.

Collaborator: Makes sense, and it's simpler at the end of the day.

open_ai_client=openai_client,
openai_host=OPENAI_HOST,
emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"],
emb_model_dimensions=emb_model_dimensions,
azure_openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"),
azure_openai_endpoint=azure_embedding_endpoint,
disable_vectors=dont_use_vectors,
disable_batch_vectors=args.disablebatchvectors,
)

ingestion_strategy: Strategy
if use_int_vectorization:

if not openai_embeddings_service or not isinstance(openai_embeddings_service, AzureOpenAIEmbeddingService):
if not openai_embeddings_service or OPENAI_HOST not in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]:
raise Exception("Integrated vectorization strategy requires an Azure OpenAI embeddings service")

ingestion_strategy = IntegratedVectorizerStrategy(
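For context on the setup_openai_client change above: with the Azure OpenAI v1 API surface, a plain AsyncOpenAI client is pointed at the resource's /openai/v1 endpoint and authenticated with either an API key or an Entra ID bearer-token provider, so no api-version parameter is needed. Below is a minimal standalone sketch of the keyless path, assuming a recent openai package that accepts a callable token provider as api_key (the diff above relies on the same behavior); the service name is a placeholder.

from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
from openai import AsyncOpenAI

# Placeholder resource name; substitute your Azure OpenAI service name.
endpoint = "https://<AZURE_OPENAI_SERVICE>.openai.azure.com/openai/v1"

credential = AzureDeveloperCliCredential()
token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")

# Mirrors the pattern in the diff above: the token provider is passed as api_key
# in place of a static key, and no api_version argument is required.
openai_client = AsyncOpenAI(base_url=endpoint, api_key=token_provider)

The key-based path in the diff is the same construction, with api_key set to the AZURE_OPENAI_API_KEY_OVERRIDE value instead of the token provider.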