Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ log/*
logs/
parts/*
json_results/*
pdfs/
results/
81 changes: 76 additions & 5 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import math
import random
import re
import logging
from .utils import *
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
Expand Down Expand Up @@ -322,10 +323,70 @@ def toc_transformer(toc_content, model=None):
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)


last_complete = json.loads(last_complete)

cleaned_response=convert_page_to_int(last_complete['table_of_contents'])
return cleaned_response
# Use extract_json instead of json.loads for better error handling
try:
# First, try to get the JSON content properly
if isinstance(last_complete, str):
# Clean the response to extract just the JSON part
last_complete = last_complete.strip()

# Debug: log what we're trying to parse
logging.info(f"Attempting to parse JSON: {last_complete[:500]}...") # Log first 500 chars

# Try using extract_json first
last_complete_json = extract_json(last_complete)

if isinstance(last_complete_json, dict) and 'table_of_contents' in last_complete_json:
cleaned_response = convert_page_to_int(last_complete_json['table_of_contents'])
return cleaned_response
else:
logging.warning(f"extract_json returned unexpected format: {type(last_complete_json)}")
logging.warning(f"Keys available: {list(last_complete_json.keys()) if isinstance(last_complete_json, dict) else 'Not a dict'}")

# Fallback: try direct JSON parsing
last_complete_json = json.loads(last_complete)
if 'table_of_contents' in last_complete_json:
cleaned_response = convert_page_to_int(last_complete_json['table_of_contents'])
return cleaned_response
else:
logging.error(f"JSON doesn't contain 'table_of_contents' key. Available keys: {list(last_complete_json.keys())}")

except json.JSONDecodeError as e:
logging.error(f"JSON parsing error in toc_transformer: {e}")
logging.error(f"Problematic JSON content (first 1000 chars): {last_complete[:1000]}")

except Exception as e:
logging.error(f"Unexpected error in toc_transformer: {e}")
logging.error(f"Content type: {type(last_complete)}")

# Final fallback: try to extract JSON manually
try:
# Find the start and end of JSON structure
start_idx = last_complete.find('{')
if start_idx != -1:
# Find the matching closing brace
brace_count = 0
for i, char in enumerate(last_complete[start_idx:], start_idx):
if char == '{':
brace_count += 1
elif char == '}':
brace_count -= 1
if brace_count == 0:
clean_json = last_complete[start_idx:i+1]
logging.info(f"Manually extracted JSON: {clean_json}")
last_complete_json = json.loads(clean_json)
if 'table_of_contents' in last_complete_json:
cleaned_response = convert_page_to_int(last_complete_json['table_of_contents'])
return cleaned_response
else:
logging.error(f"Manually extracted JSON doesn't contain 'table_of_contents'. Keys: {list(last_complete_json.keys())}")

logging.error("Could not extract valid JSON with table_of_contents")
return []

except Exception as fallback_error:
logging.error(f"Fallback JSON parsing also failed: {fallback_error}")
return []



Expand Down Expand Up @@ -1058,6 +1119,10 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
def page_index_main(doc, opt=None):
logger = JsonLogger(doc)

# Set up cost tracking with logger
from .utils import set_global_logger
set_global_logger(logger)

is_valid_pdf = (
(isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
isinstance(doc, BytesIO)
Expand Down Expand Up @@ -1097,7 +1162,13 @@ async def page_index_builder():
'structure': structure,
}

return asyncio.run(page_index_builder())
result = asyncio.run(page_index_builder())

# Log final cost summary to file
from .utils import log_final_cost_summary
log_final_cost_summary()

return result


def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
Expand Down
212 changes: 193 additions & 19 deletions pageindex/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import tiktoken
import openai
from openai import AzureOpenAI
import logging
import os
from datetime import datetime
Expand All @@ -17,7 +18,132 @@
from pathlib import Path
from types import SimpleNamespace as config

# Credentials are read once at import time from the environment.
CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")  # legacy key name — presumably superseded by OPENAI_API_KEY; confirm before removing
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # key passed to the AzureOpenAI client below

# Load Azure OpenAI-specific configurations
OPENAI_API_TYPE = os.getenv("OPENAI_API_TYPE", "azure")  # NOTE(review): not referenced in this module's visible code — verify it is still needed
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")  # Azure endpoint, used as azure_endpoint=
OPENAI_API_VERSION = os.getenv("OPENAI_API_VERSION")  # API version string, used as api_version=
OPENAI_API_ENGINE = os.getenv("OPENAI_API_ENGINE")  # deployment name, passed as the model= argument

# Note: Using modern AzureOpenAI client instead of global configuration

# Global cost tracking, mutated by log_cost(); lives for the whole process.
total_cost = 0.0  # running dollar total across all API calls
call_count = 0  # number of API calls made so far
global_logger = None  # optional structured-log sink, set via set_global_logger()

# Azure OpenAI pricing per 1K tokens (update these based on your specific pricing).
# These are example rates - adjust based on your actual Azure OpenAI pricing.
# Keys are the model/deployment names passed to calculate_cost(); the
# "default" entry is the fallback used for models missing from this table.
PRICING = {
    "gpt-4": {
        "input": 0.03,  # per 1K input tokens
        "output": 0.06  # per 1K output tokens
    },
    "gpt-4-turbo": {
        "input": 0.01,
        "output": 0.03
    },
    "gpt-4o": {
        "input": 0.005,
        "output": 0.015
    },
    "gpt-4.1-nano": {  # Add your specific model
        "input": 0.005,  # Adjust based on actual pricing
        "output": 0.015  # Adjust based on actual pricing
    },
    "default": {
        "input": 0.01,
        "output": 0.03
    }
}

def set_global_logger(logger):
    """Register *logger* as the process-wide sink for cost records.

    Subsequent calls to ``log_cost``/``log_final_cost_summary`` write their
    structured records through this logger in addition to the console.
    """
    global global_logger
    global_logger = logger

def calculate_cost(model_name, input_tokens, output_tokens):
    """Compute the dollar cost of one API call from its token counts.

    Args:
        model_name: Model/deployment key looked up in ``PRICING``;
            unknown models fall back to the ``"default"`` rates.
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.

    Returns:
        Tuple ``(call_cost, input_cost, output_cost)`` in dollars.
    """
    # Get pricing for the model or use default
    model_pricing = PRICING.get(model_name, PRICING["default"])

    input_cost = (input_tokens / 1000) * model_pricing["input"]
    output_cost = (output_tokens / 1000) * model_pricing["output"]
    # Named `call_cost` (not `total_cost`) so it does not shadow the
    # module-level running `total_cost` accumulator — the original
    # shadowing invited accidental `global` bugs on later edits.
    call_cost = input_cost + output_cost

    return call_cost, input_cost, output_cost

def log_cost(model_name, input_tokens, output_tokens, prompt_preview=""):
    """Record the cost of one chat-completion API call.

    Updates the module-level running totals, prints a per-call summary to
    the console, and — when a logger has been registered via
    ``set_global_logger`` — appends a structured ``api_cost`` record to it.

    Args:
        model_name: Model/deployment name used for the pricing lookup.
        input_tokens: Prompt token count.
        output_tokens: Completion token count.
        prompt_preview: Optional prompt text; only its first 100 characters
            are recorded and printed.

    Returns:
        The cost of this single call in dollars.
    """
    global total_cost, call_count, global_logger

    call_cost, input_cost, output_cost = calculate_cost(model_name, input_tokens, output_tokens)
    total_cost += call_cost
    call_count += 1

    # Truncate once so the console line and the structured record agree.
    # (Previously the console always appended "..." even for short prompts.)
    if len(prompt_preview) > 100:
        preview = prompt_preview[:100] + "..."
    else:
        preview = prompt_preview

    # Create cost data structure
    cost_data = {
        "type": "api_cost",
        "call_number": call_count,
        "model": model_name,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "input_cost": round(input_cost, 6),
        "output_cost": round(output_cost, 6),
        "call_cost": round(call_cost, 6),
        "running_total": round(total_cost, 6),
        "prompt_preview": preview,
        "timestamp": datetime.now().isoformat()
    }

    # Console output
    print(f"💰 API Call #{call_count} Cost:")
    print(f" Model: {model_name}")
    print(f" Input tokens: {input_tokens} (${input_cost:.4f})")
    print(f" Output tokens: {output_tokens} (${output_cost:.4f})")
    print(f" Call cost: ${call_cost:.4f}")
    print(f" Running total: ${total_cost:.4f}")
    if prompt_preview:
        print(f" Prompt preview: {preview}")
    print("-" * 50)

    # Log to file if logger is available
    if global_logger:
        global_logger.log("INFO", cost_data)

    return call_cost

def log_final_cost_summary():
    """Emit the end-of-run cost summary to the console and, when a logger
    was registered via ``set_global_logger``, to the log file as a
    structured ``cost_summary`` record.

    Returns:
        The summary dict, so callers can inspect or persist it.
    """
    global total_cost, call_count, global_logger

    have_calls = call_count > 0
    avg_cost = round(total_cost / call_count, 6) if have_calls else 0

    summary_data = {
        "type": "cost_summary",
        "total_api_calls": call_count,
        "total_cost": round(total_cost, 6),
        "average_cost_per_call": avg_cost,
        "timestamp": datetime.now().isoformat()
    }

    # Console output
    divider = "=" * 60
    print("\n" + divider)
    print("💰 FINAL COST SUMMARY")
    print(divider)
    print(f"Total API calls: {call_count}")
    print(f"Total cost: ${total_cost:.4f}")
    if have_calls:
        print(f"Average cost per call: ${total_cost/call_count:.4f}")
    else:
        print("No API calls made")
    print(divider)

    # Mirror the summary into the structured log when one is configured.
    if global_logger:
        global_logger.log("INFO", summary_data)

    return summary_data

def get_total_cost():
    """Return the accumulated cost tracking state.

    Note: despite the name, this returns a 2-tuple
    ``(total_cost, call_count)``, not just the dollar total.
    """
    return total_cost, call_count

def count_tokens(text, model=None):
if not text:
Expand All @@ -26,9 +152,9 @@ def count_tokens(text, model=None):
tokens = enc.encode(text)
return len(tokens)

def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
# Update the OpenAI client initialization to use modern Azure OpenAI client
def ChatGPT_API_with_finish_reason(model, prompt, chat_history=None):
max_retries = 10
client = openai.OpenAI(api_key=api_key)
for i in range(max_retries):
try:
if chat_history:
Expand All @@ -37,15 +163,32 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_
else:
messages = [{"role": "user", "content": prompt}]

# Count input tokens
input_text = prompt if not chat_history else "\n".join([msg.get("content", "") for msg in messages])
input_tokens = count_tokens(input_text, model)

client = AzureOpenAI(
api_key=OPENAI_API_KEY,
api_version=OPENAI_API_VERSION,
azure_endpoint=OPENAI_API_BASE
)
response = client.chat.completions.create(
model=model,
model=OPENAI_API_ENGINE,
messages=messages,
temperature=0,
)

# Get response content and count output tokens
response_content = response.choices[0].message.content
output_tokens = count_tokens(response_content, model)

# Log cost information
log_cost(OPENAI_API_ENGINE, input_tokens, output_tokens, prompt)

if response.choices[0].finish_reason == "length":
return response.choices[0].message.content, "max_output_reached"
return response_content, "max_output_reached"
else:
return response.choices[0].message.content, "finished"
return response_content, "finished"

except Exception as e:
print('************* Retrying *************')
Expand All @@ -54,13 +197,12 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_
time.sleep(1) # Wait for 1秒 before retrying
else:
logging.error('Max retries reached for prompt: ' + prompt)
return "Error"
return "Error", "error"



def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
def ChatGPT_API(model, prompt, chat_history=None):
max_retries = 10
client = openai.OpenAI(api_key=api_key)
for i in range(max_retries):
try:
if chat_history:
Expand All @@ -69,13 +211,29 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
else:
messages = [{"role": "user", "content": prompt}]

# Count input tokens
input_text = prompt if not chat_history else "\n".join([msg.get("content", "") for msg in messages])
input_tokens = count_tokens(input_text, model)

client = AzureOpenAI(
api_key=OPENAI_API_KEY,
api_version=OPENAI_API_VERSION,
azure_endpoint=OPENAI_API_BASE
)
response = client.chat.completions.create(
model=model,
model=OPENAI_API_ENGINE,
messages=messages,
temperature=0,
)

# Get response content and count output tokens
response_content = response.choices[0].message.content
output_tokens = count_tokens(response_content, model)

# Log cost information
log_cost(OPENAI_API_ENGINE, input_tokens, output_tokens, prompt)

return response.choices[0].message.content
return response_content
except Exception as e:
print('************* Retrying *************')
logging.error(f"Error: {e}")
Expand All @@ -86,18 +244,34 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
return "Error"


async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
async def ChatGPT_API_async(model, prompt, api_key=OPENAI_API_KEY):
max_retries = 10
messages = [{"role": "user", "content": prompt}]
for i in range(max_retries):
try:
async with openai.AsyncOpenAI(api_key=api_key) as client:
response = await client.chat.completions.create(
model=model,
messages=messages,
temperature=0,
)
return response.choices[0].message.content
# Count input tokens
input_tokens = count_tokens(prompt, model)

# For Azure OpenAI with the new client
client = AzureOpenAI(
api_key=api_key,
api_version=OPENAI_API_VERSION,
azure_endpoint=OPENAI_API_BASE
)
response = client.chat.completions.create(
model=OPENAI_API_ENGINE,
messages=messages,
temperature=0,
)

# Get response content and count output tokens
response_content = response.choices[0].message.content
output_tokens = count_tokens(response_content, model)

# Log cost information
log_cost(OPENAI_API_ENGINE, input_tokens, output_tokens, prompt)

return response_content
except Exception as e:
print('************* Retrying *************')
logging.error(f"Error: {e}")
Expand Down
Loading