Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
333 changes: 333 additions & 0 deletions GPR_library.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
# Distributed under the MIT License.
# See LICENSE.txt for details.

# Gaussian Process Regression Machine Learning Function Library
# contains all functions necessary to run the GPR Model
# used to predict better low-eccentricity orbital parameter initial guesses
# functions:
# 1. normalize_data
# 2. denormalize_predictions
# 3. omega_and_adot
# 4. omegaAndAdot
# 5. polynomial_fit_with_confidence
# 6. GPRegressionModel (class)
# 7. train_gpr_model
# 8. predict_with_gpr_model
# 9. run_gpr_pipeline
# 10. train_model_and_eigenvalue_analysis
# 11. loo_predictions
# 12. parse_test_runs
# 13. apply_gpr_corrections
# 14. save_gpr_corrected
# 15. loo_crossval
# 16. plot_loo_residuals
Comment on lines +5 to +23
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function list in the header comment (lines 7-24) lists 16 functions, but only 4 are actually present in this file (GPRegressionModel class, train_gpr_model, predict_with_gpr_model, run_gpr_pipeline). This documentation is misleading and should be updated to reflect only the functions that are actually implemented.

Suggested change
# contains all functions necessary to run the GPR Model
# used to predict better low-eccentricity orbital parameter initial guesses
# functions:
# 1. normalize_data
# 2. denormalize_predictions
# 3. omega_and_adot
# 4. omegaAndAdot
# 5. polynomial_fit_with_confidence
# 6. GPRegressionModel (class)
# 7. train_gpr_model
# 8. predict_with_gpr_model
# 9. run_gpr_pipeline
# 10. train_model_and_eigenvalue_analysis
# 11. loo_predictions
# 12. parse_test_runs
# 13. apply_gpr_corrections
# 14. save_gpr_corrected
# 15. loo_crossval
# 16. plot_loo_residuals
# Contains functions and classes necessary to run the GPR Model
# used to predict better low-eccentricity orbital parameter initial guesses
# Implemented functions/classes:
# - GPRegressionModel (class)
# - train_gpr_model
# - predict_with_gpr_model
# - run_gpr_pipeline

Copilot uses AI. Check for mistakes.

import argparse

import gpytorch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Import of 'pd' is not used.

Suggested change
import pandas as pd

Copilot uses AI. Check for mistakes.

# Imports
import torch


### ML functions
# ExactGP uses an infinite number of basis functions; GP is non-parametric and models functions globally;
# limited only by training points
class GPRegressionModel(gpytorch.models.ExactGP):
    """
    Exact GP with a sum of scaled RBF and Matern (nu=2.5) kernels, a linear
    mean function, and helpers for normalizing inputs / denormalizing outputs.

    The normalization statistics are plain attributes (not registered
    parameters or buffers); they start as None and are filled in by
    set_normalization().

    Args:
        train_x (torch.Tensor): Training input data
        train_y (torch.Tensor): Training targets
        likelihood (gpytorch.likelihoods.GaussianLikelihood): Likelihood for the model
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)

        # Supports all dimensions (ie GPR can be run from 1-8 dimensions);
        # a 1D training tensor is treated as a single input feature
        input_dim = train_x.shape[1] if train_x.dim() > 1 else 1

        # Base kernels - a mixture of the RBF and Matern kernels, each with
        # one lengthscale per input dimension (ARD)
        self.rbf_kernel = gpytorch.kernels.RBFKernel(ard_num_dims=input_dim)
        self.matern_kernel = gpytorch.kernels.MaternKernel(
            nu=2.5, ard_num_dims=input_dim
        )

        # Wrap each kernel with a scale kernel - introduces a learnable scaling factor
        self.scaled_rbf = gpytorch.kernels.ScaleKernel(self.rbf_kernel)
        self.scaled_matern = gpytorch.kernels.ScaleKernel(self.matern_kernel)

        # Sum of the kernels - lets the model capture more complex behavior
        # than either kernel alone would
        self.covar_module = self.scaled_rbf + self.scaled_matern

        # Linear mean instead of the default zero mean; sized from the data so
        # that multi-dimensional inputs do not cause a matrix-shape mismatch
        self.mean_module = gpytorch.means.LinearMean(input_size=input_dim)

        # Normalization parameters - mean and std of the inputs and outputs
        self.input_mean = None
        self.input_std = None
        self.output_mean = None
        self.output_std = None

    def set_normalization(self, input_mean, input_std, output_mean, output_std):
        """
        Store normalization parameters in the model.
        """
        self.input_mean = input_mean
        self.input_std = input_std
        self.output_mean = output_mean
        self.output_std = output_std

    def _stats_like(self, reference, mean, std):
        """
        Return (mean, std) in a form that can be combined with `reference`.

        Raises:
            RuntimeError: If the statistics have not been set yet.
        """
        if mean is None or std is None:
            raise RuntimeError(
                "Normalization parameters are not set; "
                "call set_normalization() first."
            )
        # The statistics may be stored as NumPy values while `reference` is a
        # torch tensor; convert them so tensor arithmetic is well defined and
        # stays on the tensor's dtype and device.
        if torch.is_tensor(reference):
            mean = torch.as_tensor(
                mean, dtype=reference.dtype, device=reference.device
            )
            std = torch.as_tensor(
                std, dtype=reference.dtype, device=reference.device
            )
        return mean, std

    def normalize_input(self, X):
        """
        Normalize input using stored parameters: (X - input_mean) / input_std.

        Raises:
            RuntimeError: If set_normalization() has not been called.
        """
        mean, std = self._stats_like(X, self.input_mean, self.input_std)
        return (X - mean) / std

    def denormalize_output(self, Y_normalized):
        """
        Denormalize output using stored parameters (converts normalized output
        back to original scale): Y_normalized * output_std + output_mean.

        Raises:
            RuntimeError: If set_normalization() has not been called.
        """
        mean, std = self._stats_like(
            Y_normalized, self.output_mean, self.output_std
        )
        return (Y_normalized * std) + mean

    def forward(self, x, normalize_input=False):
        """
        Forward pass with optional input normalization.

        Args:
            x (torch.Tensor): Input data
            normalize_input (bool): If True, normalize x using stored parameters.

        Returns:
            gpytorch.distributions.MultivariateNormal: Distribution for the input.
        """
        # NOTE(review): the flag normalizes every tensor forward() receives.
        # If x already contains normalized data (e.g. the stored training
        # inputs), it would be normalized a second time - confirm how the
        # caller invokes the model before enabling this for prediction.
        if normalize_input:
            x = self.normalize_input(x)
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


# GPR training function
def train_gpr_model(raw_X, raw_Y, learning_rate=0.05, num_iterations=200):
    """
    Train a GPR model with normalization parameters stored in the model.

    Inputs are standardized column-wise and targets with a single scalar
    mean/std. The model is fit by minimizing the negative exact marginal log
    likelihood with Adam, halving the learning rate (ReduceLROnPlateau,
    patience 5) when the loss stops improving. The parameters that produced
    the lowest observed loss are restored before returning.

    Args:
        raw_X (numpy.ndarray): Raw input data, shape (N,) or (N, D)
        raw_Y (numpy.ndarray): Raw output data
        learning_rate (float): Initial Adam learning rate.
        num_iterations (int): Number of optimization iterations.

    Returns:
        GPRegressionModel: Trained model with the normalization parameters stored
        gpytorch.likelihoods.GaussianLikelihood: Likelihood for the model
    """
    # Local import: only needed for snapshotting the best parameters
    import copy

    raw_X = np.asarray(raw_X)
    raw_Y = np.asarray(raw_Y)

    # Per-column statistics, shape (D,), so that each input dimension is
    # scaled on its own instead of sharing one scalar mean/std
    input_mean = raw_X.mean(axis=0)
    input_std = raw_X.std(axis=0)
    output_mean = raw_Y.mean()
    output_std = raw_Y.std()

    # A constant column (or constant target) has zero spread; dividing by it
    # would fill the training data with NaN/inf. Use a unit scale instead,
    # which leaves such values at exactly zero after centering.
    input_std = np.where(input_std == 0, 1.0, input_std)
    if output_std == 0:
        output_std = 1.0

    # Normalize data column-wise; needed for all dimension > 1
    normalized_X = (raw_X - input_mean) / input_std
    normalized_Y = (raw_Y - output_mean) / output_std

    # Ensure X is (N, D) even for a single feature
    if normalized_X.ndim == 1:
        normalized_X = normalized_X.reshape(-1, 1)

    # Convert to float32 PyTorch tensors
    train_X = torch.from_numpy(normalized_X).float()
    train_Y = torch.from_numpy(normalized_Y).float()

    # Define the likelihood and the model
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_X, train_Y, likelihood)

    # Store the (zero-safe) normalization parameters so prediction applies
    # exactly the same transform as training
    model.set_normalization(input_mean, input_std, output_mean, output_std)

    # Set model to training mode
    model.train()
    likelihood.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode="min",
        factor=0.5,  # Reduce LR by a factor of 0.5 when triggered
        patience=5,  # Wait 5 epochs before reducing LR if there is no improvement
    )

    # Loss function
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Track the best parameters seen so far
    best_loss = float("inf")
    best_state = None

    for _ in range(num_iterations):
        # Reset gradients from the previous iteration
        optimizer.zero_grad()

        # Negative marginal log likelihood at the current parameters
        output = model(train_X)
        loss = -mll(output, train_Y)
        current_loss = loss.item()

        # Snapshot BEFORE the optimizer step so the saved parameters are the
        # ones that produced this loss. A deep copy is required: the tensors
        # in state_dict() alias the live parameters, so a shallow copy would
        # silently follow every later update.
        if current_loss < best_loss:
            best_loss = current_loss
            best_state = copy.deepcopy(model.state_dict())

        # Backpropagation and parameter update
        loss.backward()
        optimizer.step()

        # Let the scheduler react to the loss trend
        scheduler.step(current_loss)

    # Restore the best parameters (None only if no finite loss was observed)
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, likelihood


# GPR prediction function
def predict_with_gpr_model(raw_X, model, likelihood):
    """
    Evaluate a trained GPR model on raw inputs.

    The inputs are scaled with the normalization statistics stored on the
    model, pushed through the model and likelihood in evaluation mode, and
    the resulting mean / standard deviation are mapped back to the original
    output scale.

    Args:
        raw_X (numpy.ndarray): Raw input data.
        model (GPRegressionModel): Trained model.
        likelihood (gpytorch.likelihoods.GaussianLikelihood): Likelihood
            for the model.

    Returns:
        numpy.ndarray: Predicted mean (denormalized).
        numpy.ndarray: Predicted standard deviation (denormalized).
    """
    # Evaluation mode for both the model and the likelihood
    model.eval()
    likelihood.eval()

    # Apply the training-time input scaling and move to a float32 tensor
    scaled_inputs = torch.from_numpy(
        (raw_X - model.input_mean) / model.input_std
    ).float()

    # Predictive distribution, computed without gradient tracking
    with torch.no_grad():
        predictive = likelihood(model(scaled_inputs))

    # Still in normalized units at this point
    mean_scaled = predictive.mean.numpy()
    std_scaled = predictive.variance.sqrt().numpy()

    # The mean needs both scale and shift; the standard deviation only the scale
    return (
        model.denormalize_output(mean_scaled),
        std_scaled * model.output_std,
    )


# GPR pipeline function - runs the entire process - including training, predicting, plotting - and outputs performance metrics
# This function encompasses previous functions defined above: train_gpr_model and predict_with_gpr_model and runs them together
def run_gpr_pipeline(X, Y, target_name="target", plot=True, silent=False):
    """
    Train a GPR model on (X, Y), predict on X, plot, and report metrics for a given target.

    Predictions are made on the training inputs themselves, so the reported
    metrics describe the quality of the fit, not held-out performance.

    Args:
        X (np.ndarray): Input data.
        Y (np.ndarray): Target output deltas.
        target_name (str): For labeling plots & output.
        plot (bool): Whether to produce correlation plots.
        silent (bool): Whether to suppress print statements entirely.

    Returns:
        tuple: (model, likelihood, Y_pred) - the trained model, its
        likelihood, and the denormalized predictions on X. The predictive
        standard deviations are computed but not returned.
    """
    # Train GPR model
    model, likelihood = train_gpr_model(X, Y)

    # Make predictions (the uncertainties are not part of the return value)
    Y_pred, _ = predict_with_gpr_model(X, model, likelihood)

    # Metrics are needed by both the plot annotation and the printed report,
    # so compute them once whenever either consumer is active
    if plot or not silent:
        corr = np.corrcoef(Y, Y_pred)[0, 1]
        r2 = corr**2  # squared Pearson correlation between Y and Y_pred
        residuals = Y - Y_pred
        rmse = np.sqrt(np.mean(residuals**2))
        mae = np.mean(np.abs(residuals))

    # If specified, create correlation plot annotated with the metrics
    if plot:
        plt.figure(figsize=(8, 6))
        plt.scatter(Y, Y_pred, alpha=0.6, s=20)

        # Perfect correlation line (y = x) spanning both data ranges
        min_val = min(Y.min(), Y_pred.min())
        max_val = max(Y.max(), Y_pred.max())
        plt.plot(
            [min_val, max_val],
            [min_val, max_val],
            "r--",
            lw=2,
            label="Perfect Correlation",
        )

        # Labels and formatting
        plt.xlabel(f"ΔTrue {target_name}", fontsize=12)
        plt.ylabel(f"GPR Predicted Δ{target_name}", fontsize=12)
        plt.title(
            f"GPR Predictions vs True Values ({target_name})", fontsize=14
        )
        plt.grid(True, alpha=0.3)
        plt.legend()

        # Metrics box in the lower-right corner of the axes
        metrics_text = f"R² = {r2:.4f}\nRMSE = {rmse:.8f}\nMAE = {mae:.8f}"
        plt.text(
            0.95,
            0.05,
            metrics_text,
            transform=plt.gca().transAxes,
            fontsize=12,
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
            verticalalignment="bottom",
            horizontalalignment="right",
        )

        plt.tight_layout()
        plt.show()

    if not silent:
        # Print performance metrics regardless of plotting, followed by the
        # reference goals they should be judged against
        print(
            f"{target_name}: R² = {r2:.4f}, "
            f"RMSE = {rmse:.8f}, MAE = {mae:.8f}"
        )
        print("R²: goal: > 0.95 excellent, > 0.90 good, < 0.70 poor")
        print("RMSE goal: < 1 % of target range, lower is better")
        print("MAE goal: < 1 % of target range, lower is better")

    return model, likelihood, Y_pred
Loading