Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ jobs:
*.cache-to=type=gha,mode=max
env:
OPENTOPOGRAPHY_API_KEY: ${{ secrets.OPENTOPOGRAPHY_API_KEY }}
CACHEBUST: ${{ github.sha }}

- name: Start services
run: docker compose up -d --wait
Expand Down
7 changes: 6 additions & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,18 @@ COPY . .
# Train the yield prediction model at build time so it is always embedded in
# the image and stays in sync with the code/data. SECRET_KEY is a throwaway
# value used only to satisfy django.setup(); no database is needed.
# The training script enforces a minimum R-squared threshold and will exit
# non-zero if the model does not meet the required accuracy.
# CACHEBUST: pass a unique value (e.g. git SHA, timestamp) to force
# retraining even when no source files have changed.
ARG OPENTOPOGRAPHY_API_KEY=keykey
ARG CACHEBUST
RUN cd YieldPredictionModel \
&& SECRET_KEY=build-only-key \
OPENTOPOGRAPHY_API_KEY=${OPENTOPOGRAPHY_API_KEY} \
python -u CreateAndTrainYieldCalculatorModel.py \
&& test -f Models/yield_model.keras \
|| { echo "ERROR: Model training failed - yield_model.keras not created"; exit 1; }
|| { echo "ERROR: Model training failed - check accuracy report above for details"; exit 1; }

# Drop root
RUN useradd -m appuser && chown -R appuser /app
Expand Down
98 changes: 48 additions & 50 deletions backend/YieldPredictionModel/CreateAndTrainYieldCalculatorModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@
from helpers import Create_Model, encode, save_to_csv
from core.services import create_charai_data

# Minimum R-squared the trained model must achieve on the held-out test set.
# If the model scores below this value, the script exits with a non-zero code
# so that Docker builds and CI pipelines fail early with a clear error.
# Adjust this threshold as training data or model architecture improves.
MIN_R2_THRESHOLD = 0.0

harvest_file_name_total = "./Data/CookHandHarvest_HY1999-HY2016_P3A3_20241029(in).csv"
tiff_file_path = "./Data/tiff.tif"

Expand All @@ -41,24 +47,33 @@
)

# ---------- Cook Harvest Prep ----------
logger.info("getting harvest")
logger.info("Loading harvest data")
harvest = pd.read_csv(harvest_file_name_total)
logger.info("Raw harvest rows: %d, columns: %d", *harvest.shape)

# print(harvest["Crop"].unique())

# drop any columns that have more than 1000 missing values
# Drop columns where more than 1000 values are missing -- these columns
# are too sparse to be useful for training.
harvest = harvest.loc[:, harvest.isna().sum() <= 1000]

# Drop unnecessary "SampleID" Column
# Drop unnecessary "SampleID" column
harvest.drop(columns=["SampleID"], inplace=True)

# Drop rows with missing "Crop" or "GrainYieldAirDry" Values
# Drop rows with any remaining missing values (Crop, GrainYieldAirDry, etc.)
harvest = harvest.dropna()
harvest.isna().sum()

# Drop columns: QCCoverage, QCFlags, CropExists
# Drop metadata columns not used for training
harvest.drop(columns=["QCCoverage", "QCFlags", "CropExists", "ID2", "HarvestYear"], inplace=True)

# Remove rows where the crop failed or was not harvested (zero yield).
# These represent planting failures, not valid yield observations, and
# would bias the model toward predicting lower yields.
zero_yield_count = (harvest["GrainYieldAirDry"] <= 0).sum()
if zero_yield_count > 0:
logger.info("Removing %d rows with zero/negative yield (failed crops)", zero_yield_count)
harvest = harvest[harvest["GrainYieldAirDry"] > 0]

logger.info("Cleaned harvest rows: %d", len(harvest))

# ---------- Get CharAI Generated Data ----------

charai = create_charai_data(logger, cook_farm_coords, tiff_file_path)
Expand Down Expand Up @@ -146,50 +161,33 @@
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# date = date
# time = time
# ---------- Accuracy Report ----------
logger.info("--- Model Accuracy Report ---")
logger.info(" Test Loss (MSE) : %.4f", test_loss)
logger.info(" Test MAE : %.4f", test_mae)
logger.info(" RMSE : %.4f", rmse)
logger.info(" R-squared (R2) : %.4f", r2)
logger.info(" Min R2 Threshold: %.4f", MIN_R2_THRESHOLD)
logger.info(" Training rows : %d", len(X_train))
logger.info(" Test rows : %d", len(X_test))
logger.info(" Features : %s", ", ".join(YieldCalculator.MODEL_FEATURE_COLUMNS))
logger.info("--- End Accuracy Report ---")

# ---------- Accuracy Gate ----------
if r2 < MIN_R2_THRESHOLD:
logger.error(
"ACCURACY CHECK FAILED: R2=%.4f is below the minimum threshold of %.4f. "
"The model does not meet the required accuracy for deployment. "
"Review training data, feature engineering, or model architecture. "
"To adjust the threshold, update MIN_R2_THRESHOLD in this script.",
r2,
MIN_R2_THRESHOLD,
)
sys.exit(1)

logger.info("Accuracy check passed (R2=%.4f >= %.4f)", r2, MIN_R2_THRESHOLD)

Path("./Models").mkdir(parents=True, exist_ok=True)
model.save("./Models/yield_model.keras")
logger.info("Model saved to ./Models/yield_model.keras")

# model.save("./Models/yield_model{date}_{time}.keras")

# ---------- Feature Sensitivity Analysis ----------
# For each feature, perturb it by a small delta and measure how predicted
# yield changes. This tells us the direction and magnitude of each
# feature's influence, which informs the biochar feature adjustments in
# _calculate_biochar_yield.

logger.info("--- Feature Sensitivity Analysis ---")

feature_columns = YieldCalculator.MODEL_FEATURE_COLUMNS
baseline_features = X_test.to_numpy(dtype=np.float32)
baseline_preds = model.predict(baseline_features, verbose=0).flatten()

# Use 1 % of each feature's standard deviation as the perturbation step.
# For features with zero std (constant), fall back to 1 % of the mean or 0.01.
PERTURBATION_FRACTION = 0.01

for i, col in enumerate(feature_columns):
std = X_test[col].std()
mean = X_test[col].mean()
delta = std * PERTURBATION_FRACTION if std > 0 else abs(mean) * PERTURBATION_FRACTION or 0.01

perturbed = baseline_features.copy()
perturbed[:, i] += delta

perturbed_preds = model.predict(perturbed, verbose=0).flatten()
mean_yield_change = (perturbed_preds - baseline_preds).mean()
sensitivity = mean_yield_change / delta # yield change per unit feature change

direction = "INCREASE" if sensitivity > 0 else "DECREASE"

logger.info(
f" {col:25s} | std={std:.4f} mean={mean:.4f} delta={delta:.6f} "
f"| avg yield Δ = {mean_yield_change:+.4f} | sensitivity = {sensitivity:+.4f}/unit "
f"| To boost yield: {direction} this feature"
)

logger.info("--- End Sensitivity Analysis ---")

89 changes: 89 additions & 0 deletions backend/YieldPredictionModel/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,95 @@ Export to Colab and run a script to train the model.

Import the model into repo

## Model Accuracy Requirements

The training script (`CreateAndTrainYieldCalculatorModel.py`) enforces a minimum
model accuracy on every run. If the trained model does not meet the threshold,
the script exits with a non-zero code, which causes Docker builds and CI
pipelines to fail with a clear error message.

### Current Threshold

| Metric | Minimum Value | Notes |
| ------------ | ------------- | -------------------------------------------- |
| R-squared | 0.0 | Measured on 20% held-out test set |

The threshold is defined as `MIN_R2_THRESHOLD` at the top of the training
script. It can be raised as the model architecture, feature set, or training
data improves.

### Why R-squared?

R-squared (coefficient of determination) measures the proportion of variance in
the target variable (`GrainYieldAirDry`) explained by the model. The model uses
five features: crop type (`Crop`) and four terrain features (`elev_mean_m`,
`slope_mean_deg`, `aspect_eastness`, `aspect_northness`). The threshold is a
floor to catch broken training runs rather than a quality target.

### What Happens on Failure

When R-squared falls below the threshold the training script:
1. Logs an `ERROR`-level message with the actual vs. required R-squared.
2. Exits with code 1.
3. The Docker build step in `Dockerfile` detects the non-zero exit and aborts.
4. CI reports the failure in the build logs with a descriptive error.

### How to Update the Threshold

1. Open `CreateAndTrainYieldCalculatorModel.py`.
2. Change `MIN_R2_THRESHOLD` to the new value.
3. Rebuild the Docker image locally (`docker compose build backend`) to verify
the model still passes.
4. Commit and push -- CI will enforce the new threshold automatically.

### Accuracy Report in Logs

Every training run prints an accuracy report to stdout/stderr:
```
--- Model Accuracy Report ---
Test Loss (MSE) : <value>
Test MAE : <value>
RMSE : <value>
R-squared (R2) : <value>
Min R2 Threshold: <value>
Training rows : <count>
Test rows : <count>
Features : Crop, elev_mean_m, slope_mean_deg, aspect_eastness, aspect_northness
--- End Accuracy Report ---
```

### Data Quality -- Cook Harvest

The training script cleans the Cook Farm harvest data before training:
- Columns with more than 1000 missing values are dropped (too sparse).
- Rows with any remaining missing values are removed.
- Metadata columns (`SampleID`, `QCCoverage`, `QCFlags`, `CropExists`, `ID2`,
`HarvestYear`) are dropped.
- **Zero-yield rows are removed.** These represent planting failures or
unharvested samples and would bias the model toward predicting lower yields.

### Model Features

The model uses five features (defined in `YieldCalculator.MODEL_FEATURE_COLUMNS`):

| Feature | Type | Range | Source |
| ------------------ | ----------- | ------------- | -------------- |
| `Crop` | Categorical | 0-11 (encoded)| Harvest data |
| `elev_mean_m` | Continuous | ~750-800 | DEM/GeoParser |
| `slope_mean_deg` | Continuous | ~0-20 | DEM/GeoParser |
| `aspect_eastness` | Continuous | -1 to 1 | DEM/GeoParser |
| `aspect_northness` | Continuous | -1 to 1 | DEM/GeoParser |

**Crop encoding** uses a fixed alphabetical mapping defined in
`YieldCalculator.CROP_ENCODING`. This must stay consistent between training and
inference. Do not use `sklearn.LabelEncoder` directly as it produces different
encodings depending on which values it sees.

**Feature scaling** is handled by a `BatchNormalization` layer at the front of
the neural network. The learned normalization parameters are stored inside the
`.keras` model file, so inference automatically applies the same scaling without
a separate scaler artifact.

## Model Artifacts in CI

Model files (`*.keras`) generated by `CreateAndTrainYieldCalculatorModel.py` are
Expand Down
29 changes: 21 additions & 8 deletions backend/YieldPredictionModel/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,44 @@
import pandas as pd
import numpy as np
import keras
from sklearn.preprocessing import LabelEncoder

def save_to_csv(path: str, df: pd.DataFrame):
    """Write *df* to *path* as CSV (no index column), creating any missing parent directories."""
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(destination, index=False)

def encode(df):
    """Encode the "Crop" column to integers using the fixed mapping from YieldCalculator.

    Mutates ``df`` in place: string crop codes in the "Crop" column are
    replaced with their integer encoding from
    ``YieldCalculator.CROP_ENCODING``, which keeps training and inference
    encodings deterministic and consistent. Do NOT use
    ``sklearn.LabelEncoder`` here: its output depends on which values it
    happens to see, so it can produce different encodings per run.

    Raises:
        ValueError: if any crop code is not present in the fixed mapping.
    """
    # Local import to avoid a hard module-level dependency on the
    # calculator package when helpers is imported for other utilities.
    from modules.Calculator import YieldCalculator

    crop_values = df["Crop"]

    # Handle duplicate "Crop" columns by using the first one.
    if isinstance(crop_values, pd.DataFrame):
        crop_values = crop_values.iloc[:, 0]

    encoded = crop_values.astype(str).map(YieldCalculator.CROP_ENCODING)
    unknown = encoded.isna()
    if unknown.any():
        bad_codes = crop_values[unknown].unique().tolist()
        raise ValueError(
            f"Unknown crop code(s): {bad_codes}. "
            f"Valid codes: {sorted(YieldCalculator.CROP_ENCODING.keys())}"
        )
    df["Crop"] = encoded

def Create_Model(input_dim):
model = keras.models.Sequential()

# Input layer
model.add(keras.layers.Input(shape=(input_dim,)))

# Normalize features so the network sees approximately zero-mean,
# unit-variance inputs regardless of original scale (e.g. elevation
# ~750-800 vs aspect ~-1..1). The learned statistics are stored in
# the .keras file so inference applies the same normalization.
model.add(keras.layers.BatchNormalization())

# Hidden layers
model.add(keras.layers.Dense(
units=128,
Expand All @@ -41,21 +56,19 @@ def Create_Model(input_dim):
activation='relu',
kernel_initializer='he_normal'
))
# Optional regularization (uncomment if you see overfitting)
# model.add(keras.layers.Dropout(0.2))

# Output layer for regression (one continuous value)
model.add(keras.layers.Dense(
units=1,
activation='linear', # << no activation = regression
activation='linear',
kernel_initializer='glorot_uniform'
))

# Compile for regression
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=1e-3),
loss='mse', # mean squared error
metrics=['mae'] # mean absolute error
loss='mse',
metrics=['mae']
)

return model
1 change: 1 addition & 0 deletions backend/modules/Calculator/test_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
'centroid_lat': centroid_lat,
'centroid_lon': centroid_lon,
'pixel_count': 4,
'Crop': 'WW',
'elev_mean_m': elev_mean,
'elev_min_m': elev_mean - 1,
'elev_max_m': elev_mean + 1,
Expand Down
24 changes: 23 additions & 1 deletion backend/modules/Calculator/yield_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,20 @@ class YieldCalculator:

MODEL_LOCATION_ENV_VAR = "MODEL_LOCATION"
MODEL_FEATURE_COLUMNS = [
"Crop",
"elev_mean_m",
"slope_mean_deg",
"aspect_eastness",
"aspect_northness",
]

# Fixed crop-to-integer mapping for deterministic encoding across
# training and inference. Alphabetically sorted codes from the Cook
# Farm training CSV. Must stay in sync with crop_types.py.
CROP_ENCODING = {
"AL": 0, "GB": 1, "SB": 2, "SC": 3, "SP": 4, "SW": 5,
"WB": 6, "WC": 7, "WL": 8, "WP": 9, "WT": 10, "WW": 11,
}

# Base yield parameters (yield per acre equivalent per grid cell; unit unspecified — TODO: confirm)
BASE_YIELD = 50.0
Expand Down Expand Up @@ -156,7 +165,20 @@ def _calculate(self, df):
f"Model not loaded. Set {self.MODEL_LOCATION_ENV_VAR} to a valid model path and initialize with fetch_model=True."
)

features = df.loc[:, self.MODEL_FEATURE_COLUMNS].to_numpy(dtype=np.float32)
features = df.loc[:, self.MODEL_FEATURE_COLUMNS].copy()

# Encode Crop to its integer code when the column is still strings.
if features["Crop"].dtype == object:
features["Crop"] = features["Crop"].map(self.CROP_ENCODING)
unknown = features["Crop"].isna()
if unknown.any():
bad_codes = df.loc[unknown.values, "Crop"].unique().tolist()
raise ValueError(
f"Unknown crop code(s): {bad_codes}. "
f"Valid codes: {sorted(self.CROP_ENCODING.keys())}"
)

features = features.to_numpy(dtype=np.float32)
expected_dim = self.model.input_shape[-1]
if expected_dim is not None and features.shape[1] != expected_dim:
raise ValueError(
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ services:
context: ./backend
args:
OPENTOPOGRAPHY_API_KEY: ${OPENTOPOGRAPHY_API_KEY:-keykey}
CACHEBUST: ${CACHEBUST:-}
image: django-backend
container_name: django-backend
depends_on:
Expand Down
Loading