diff --git a/README.md b/README.md index f047266..169abaf 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ source .nlgw/bin/activate Now we can install `poetry` ```bash -pip install poetry +pip install "poetry<2.0.0" ``` The following command installs all of the necessary dependencies for `nonlocal_gwfluxes`. diff --git a/era5_training/README.md b/era5_training/README.md index 7a379e1..a085160 100644 --- a/era5_training/README.md +++ b/era5_training/README.md @@ -51,7 +51,7 @@ test-data/ ### Ann ```bash -python inference.py -M ann -d global -v global -f uvthetaw -e 8 -m 1 -s 1 -t era5 -i inputs/ -c model-huggingface/ -o outputs/ --script +python inference.py -M ann -d global -v global -f uvthetaw -e 85 -m 1 -s 1 -t era5 -i inputs/ -c model-huggingface/ -o outputs/ --script ``` This will generate some test data and a torchscripted model, to be used by `infer.f90` and `infer.py` later on. @@ -86,7 +86,7 @@ python infer.py -M ann -t test-data/ -s . To test the newly generate torchscript models, use the following command: ```bash -bash compile-and-run.sh intel +bash compile-and-run.sh gcc ``` This will compile `infer.f90` into `infer.exe`. This requires having cuda installed on your system. It also requires `ftorch` to diff --git a/era5_training/batch_ann.sh b/era5_training/batch_ann.sh index e3f655f..240367a 100644 --- a/era5_training/batch_ann.sh +++ b/era5_training/batch_ann.sh @@ -1,5 +1,5 @@ #!/bin/bash -l -#PBS -N 1x1_uvthw +#PBS -N scripting #PBS -A USTN0009 #PBS -l select=1:ncpus=4:ngpus=1:mem=80GB #PBS -l walltime=01:00:00 @@ -33,19 +33,36 @@ source ~/nonlocal_gwfluxes/.nlgw/bin/activate # -o /glade/derecho/scratch/agupta/torch_saved_models/ +#python inference.py \ +# -M attention \ +# -d global \ +# -v global \ +# -f uvthetaw \ +# -e 119 \ +# -m 1 \ +# -s 3 \ +# -t era5 \ +# -i /glade/derecho/scratch/agupta/era5_training_data/ \ +# -c /glade/derecho/scratch/agupta/hugging_face_checkpoints/ \ +# -o /glade/derecho/scratch/agupta/gw_inference_files/ + + python inference.py \ - -M attention \ - -d global \ - -v global \ - -f uvthetaw \ - -e 119 \ - -m 1 \ - -s 3 \ - -t era5 \ - -i /glade/derecho/scratch/agupta/era5_training_data/ \ - -c /glade/derecho/scratch/agupta/hugging_face_checkpoints/ \ - -o /glade/derecho/scratch/agupta/gw_inference_files/ + -M ann \ + -d global \ + -v global \ + -f uvthetaw \ + -e 70 \ + -s 1 \ + -t era5 \ + -m 1 \ + -i inputs/ \ + -c model-huggingface/ \ + -o outputs/ \ + --script + +#python inference.py -M ann -d global -v global -f uvthetaw -e 85 -m 1 -s 1 -t era5 -i /glade/derecho/scratch/agupta/new_training_data/ -c /glade/derecho/scratch/agupta/hugging_face_checkpoints/ -o /glade/derecho/scratch/agupta/gw_inference_files/ --script diff --git a/era5_training/batch_unet.sh b/era5_training/batch_unet.sh index 8bc86c6..79bf3b2 100644 --- a/era5_training/batch_unet.sh +++ b/era5_training/batch_unet.sh @@ -24,25 +24,31 @@ source ~/nonlocal_gwfluxes/.nlgw/bin/activate #python training_attention_unet.py stratosphere_only uvthetawN2 -python training.py \ - -M attention \ - -d global \ - -v stratosphere_update \ - -f uvw \ - -i /glade/derecho/scratch/agupta/era5_training_data/ \ - -o /glade/derecho/scratch/agupta/torch_saved_models/ - - -#python inference.py \ -# -M attention \ -# -d global \ -# -v stratosphere_update \ -# -f uvw \ -# -e 100 \ -# -s 1 \ -# -t era5 \ -# -m 1 \ -# -i /glade/derecho/scratch/agupta/era5_training_data/ \ +#python training.py \ +# -M attention \ +# -d global \ +# -v stratosphere_update \ +# -f uvw \ +# -i /glade/derecho/scratch/agupta/era5_training_data/ \ +# -o /glade/derecho/scratch/agupta/torch_saved_models/ + + +python inference.py \ + -M attention \ + -d global \ + -v global \ + -f uvthetaw \ + -e 100 \ + -s 1 \ + -t era5 \ + -m 1 \ + -i inputs/ \ + -c model-huggingface/ \ + -o outputs/ \ + --script + + +# -i /glade/derecho/scratch/agupta/era5_training_data/ \ # -c /glade/derecho/scratch/agupta/torch_saved_models/ \ # -o /glade/derecho/scratch/agupta/gw_inference_files/ diff --git a/era5_training/compile-and-run.sh b/era5_training/compile-and-run.sh index 60c0491..7af1485 100755 --- a/era5_training/compile-and-run.sh +++ b/era5_training/compile-and-run.sh @@ -1,34 +1,15 @@ -COMP=$1 +FC=ifort +FFLAGS="" -if [[ ${COMP} == "intel" ]]; then - FC=ifort - FFLAGS="" - - # source /glade/u/home/tmeltzer/cam-test/debug_env.sh - - module purge - module load cesmdev/1.0 ncarenv/23.06 craype/2.7.20 linaro-forge/23.0 intel/2023.0.0 mkl/2023.0.0 - module load ncarcompilers/1.0.0 cmake/3.26.3 cray-mpich/8.1.25 hdf5-mpi/1.12.2 - module load netcdf-mpi/4.9.2 parallel-netcdf/1.12.3 parallelio/2.6.2-debug esmf/8.6.0b04-debug -elif [[ ${COMP} == "gcc" ]]; then - - FC=gfortran - FFLAGS="-ffree-line-length-none" - - module purge - module load ncarenv/24.12 gcc/12.4.0 cmake cuda/12.3.2 netcdf/4.9.3 -else - RED='\033[0;31m' - GREEN='\033[0;32m' - YELLOW='\033[0;33m' - NC='\033[0m' # No Color - echo -e "${RED}ERROR:${YELLOW} required option missing. Please specify [${GREEN}gcc${YELLOW}] or [${GREEN}intel${YELLOW}] as compiler.${NC}" - exit 1 -fi +module --force purge +# these come from the environment listed in software_environment.txt in the CESM Case directory +module load cesmdev/1.0 ncarenv/23.06 craype/2.7.20 intel/2023.0.0 mkl/2023.0.0 ncarcompilers/1.0.0 +module load cmake/3.26.3 cray-mpich/8.1.25 hdf5-mpi/1.12.2 netcdf-mpi/4.9.2 parallel-netcdf/1.12.3 +module load parallelio/2.6.2 esmf/8.6.0b04 source ../.nlgw/bin/activate -FTORCH_ROOT="/glade/u/home/tmeltzer/FTorch/bin/ftorch_${COMP}" +FTORCH_ROOT="${HOME}/fresh/ftorch-install" NETCDF_LIB="${NETCDF}/lib" export LD_LIBRARY_PATH="${NETCDF_LIB}:${FTORCH_ROOT}/lib64:${LD_LIBRARY_PATH}" @@ -45,7 +26,6 @@ echo $COMMAND ${COMMAND} -# gdb -q --args ./infer.exe attention test-data/ . ./infer.exe attention test-data/ . echo echo "=========================================" diff --git a/era5_training/get-model-and-data.sh b/era5_training/get-model-and-data.sh index 059c60a..e53d6c3 100755 --- a/era5_training/get-model-and-data.sh +++ b/era5_training/get-model-and-data.sh @@ -5,10 +5,12 @@ mkdir -p inputs echo "retrieving model weights..." cd model-huggingface -wget https://huggingface.co/amangupta2/iccs_coupling_checkpoints/resolve/main/ann_cnn_1x1_global_global_era5_uvthetaw__train_epoch8.pt +wget https://huggingface.co/amangupta2/iccs_coupling_checkpoints/resolve/main/retrained_ann_cnn_1x1_global_global_era5_uvthetaw__train_epoch85.pt wget https://huggingface.co/amangupta2/iccs_coupling_checkpoints/resolve/main/ann_cnn_1x1_global_global_era5_uvthetaw__train_epoch94.pt wget https://huggingface.co/amangupta2/iccs_coupling_checkpoints/resolve/main/attnunet_era5_global_global_uvthetaw_mseloss_train_epoch119.pt cd .. +mv model-huggingface/retrained_ann_cnn_1x1_global_global_era5_uvthetaw__train_epoch85.pt model-huggingface/ann_cnn_1x1_global_global_era5_uvthetaw__train_epoch85.pt + echo "retrieving test input..." -(cd inputs && wget https://g-b56e81.7a577b.6fbd.data.globus.org/1x1_inputfeatures_u_v_theta_w_uw_vw_era5_training_data_hourly_2010_constant_mu_sigma_scaling01.nc) +(cd inputs && wget https://g-b56e81.7a577b.6fbd.data.globus.org/1x1_inputfeatures_u_v_theta_w_uw_vw_era5_training_data_hourly_2015_constant_mu_sigma_scaling01.nc) diff --git a/era5_training/infer.py b/era5_training/infer.py index 58154bd..8907be4 100644 --- a/era5_training/infer.py +++ b/era5_training/infer.py @@ -26,7 +26,8 @@ def main(): model = torch.jit.load(model_path) # run model inference - pred = model(torch.tensor(input_data).to(device)) + with torch.no_grad(): + pred = model(torch.tensor(input_data).to(device)) pred = pred.cpu().detach().numpy() print("pred.shape = ", pred.shape) diff --git a/era5_training/inference.py b/era5_training/inference.py index 3189f6b..506f371 100644 --- a/era5_training/inference.py +++ b/era5_training/inference.py @@ -108,7 +108,7 @@ print(f"output_dir={args.output_dir}") print(f"script={args.script}") -bs_train = 20 # 80 (80 works for most). (does not work for global uvthetaw) +bs_train = 5 # 20 # 80 (80 works for most). (does not work for global uvthetaw) bs_test = bs_train # -------------------------------------------------- @@ -136,11 +136,13 @@ odir = str(args.output_dir) + "/" pref = str(args.ckpt_dir) + "/" # "/scratch/users/ag4680/torch_saved_models/attention_unet/" if model == "ann": - ckpt = f"ann_cnn_{stencil}x{stencil}_{domain}_{vertical}_era5_{features}__train_epoch{epoch}.pt" + # ckpt = f"retrained_ann_cnn_{stencil}x{stencil}_{domain}_{vertical}_era5_{features}__train_epoch{epoch}.pt" + ckpt = f"retrained_L93_ann_cnn_{stencil}x{stencil}_{domain}_{vertical}_era5_{features}__train_epoch{epoch}.pt" log_filename = f"./{teston}_inference_ann_cnn_{stencil}x{stencil}_{domain}_{vertical}_{features}_ckpt_epoch_{epoch}.txt" elif model == "attention": ckpt = ( - f"attnunet_era5_{domain}_{vertical}_{features}_mseloss_train_epoch{str(epoch).zfill(2)}.pt" + # f"attnunet_era5_{domain}_{vertical}_{features}_mseloss_train_epoch{str(epoch).zfill(2)}.pt" + f"retrained_L93_attnunet_era5_{domain}_{vertical}_{features}_mseloss_train_epoch{epoch}.pt" ) log_filename = ( f"./{teston}_inference_attnunet_{domain}_{vertical}_{features}_ckpt_epoch_{epoch}.txt" @@ -157,7 +159,7 @@ # Define test files # ------- To test on one year of ERA5 data test_files = [] -test_years = np.array([2010]) +test_years = np.array([2015]) test_month = args.month # int(sys.argv[4]) # np.arange(1,13) logger.info(f"Inference for month {test_month}") if teston == "era5": @@ -174,7 +176,7 @@ ) elif vertical == "global" or vertical == "stratosphere_update": if stencil == 1: - pre = idir + f"1x1_inputfeatures_u_v_theta_w_uw_vw_era5_training_data_hourly_" + pre = idir + f"1x1_inputfeatures_u_v_theta_w_uw_vw_gcp_era5_training_data_hourly_" else: pre = ( idir @@ -183,7 +185,10 @@ for year in test_years: for months in np.arange(test_month, test_month + 1): - test_files.append(f"{pre}{year}_constant_mu_sigma_scaling{str(months).zfill(2)}.nc") + # test_files.append(f"{pre}{year}_constant_mu_sigma_scaling{str(months).zfill(2)}.nc") # usual + test_files.append( + f"{pre}{year}_L93_constant_mu_sigma_scaling{str(months).zfill(2)}.nc" + ) # L93 elif teston == "ifs": if vertical == "stratosphere_only": @@ -219,6 +224,7 @@ ) idim = testset.idim + odim = testset.odim hdim = 4 * idim diff --git a/utils/dataloader_definition.py b/utils/dataloader_definition.py index 2c65014..11328eb 100644 --- a/utils/dataloader_definition.py +++ b/utils/dataloader_definition.py @@ -51,13 +51,16 @@ def __init__(self, files, domain, vertical, stencil, manual_shuffle, features, r if self.vertical == "global": # 122 channels for each feature if self.features == "uvtheta": - self.v = np.arange(0, 369) # for u,v,theta + # self.v = np.arange(0, 369) # for u,v,theta + self.v = np.arange(0, 282) # for L93 elif self.features == "uvthetaw": - self.v = np.arange(0, 491) # for u,v,theta,w + # self.v = np.arange(0, 551) # for u,v,theta,w + self.v = np.arange(0, 375) # for L93 elif self.features == "uvw": - self.v = np.concatenate( - (np.arange(0, 247), np.arange(369, 491)), axis=0 - ) # for u,v,w + # self.v = np.concatenate( + # (np.arange(0, 247), np.arange(369, 551)), axis=0 + # ) # for u,v,w + self.v = np.concatenate((np.arange(0, 189), np.arange(282, 375)), axis=0) # for L93 self.w = np.arange(0, self.odim) # all vertical channels elif self.vertical == "stratosphere_only": @@ -86,7 +89,7 @@ def __init__(self, files, domain, vertical, stencil, manual_shuffle, features, r self.v = np.arange(0, 491) # for u,v,theta,w elif self.features == "uvw": self.v = np.concatenate( - (np.arange(0, 247), np.arange(369, 491)), axis=0 + (np.arange(0, 247), np.arange(369, 551)), axis=0 ) # for u,v,w self.w = np.concatenate( (np.arange(0, 60), np.arange(122, 182)), axis=0 @@ -296,13 +299,16 @@ def __init__(self, files, domain, vertical, manual_shuffle, features, region="1a if self.vertical == "global": # 122 channels for each feature if self.features == "uvtheta": - self.v = np.arange(3, 369) # for u,v,theta + self.v = np.arange(3, 282) # for L93 + # self.v = np.arange(3, 369) # for u,v,theta elif self.features == "uvthetaw": - self.v = np.arange(3, 491) # for u,v,theta,w + self.v = np.arange(3, 375) # for L93 + # self.v = np.arange(3, 551) # for u,v,theta,w elif self.features == "uvw": - self.v = np.concatenate( - (np.arange(3, 247), np.arange(369, 491)), axis=0 - ) # for u,v,w + self.v = np.concatenate((np.arange(3, 189), np.arange(282, 375)), axis=0) # for L93 + # self.v = np.concatenate( + # (np.arange(3, 247), np.arange(369, 551)), axis=0 + # ) # for u,v,w self.w = np.arange(0, self.odim) # all vertical channels elif self.vertical == "stratosphere_only": @@ -328,10 +334,10 @@ def __init__(self, files, domain, vertical, manual_shuffle, features, region="1a if self.features == "uvtheta": self.v = np.arange(3, 369) # for u,v,theta elif self.features == "uvthetaw": - self.v = np.arange(3, 491) # for u,v,theta,w + self.v = np.arange(3, 551) # for u,v,theta,w elif self.features == "uvw": self.v = np.concatenate( - (np.arange(3, 247), np.arange(369, 491)), axis=0 + (np.arange(3, 247), np.arange(369, 551)), axis=0 ) # for u,v,w self.w = np.concatenate( (np.arange(0, 60), np.arange(122, 182)), axis=0 diff --git a/utils/function_training.py b/utils/function_training.py index f3d54cd..ccd037f 100644 --- a/utils/function_training.py +++ b/utils/function_training.py @@ -192,7 +192,9 @@ def Inference_and_Save_ANN_CNN( INP = INP.reshape(T[0] * T[1], T[2], T[3], T[4]) T = OUT.shape OUT = OUT.reshape(T[0] * T[1], -1) - PRED = model(INP) + + with torch.no_grad(): + PRED = model(INP) if is_script: print("saving data...") @@ -205,7 +207,7 @@ def Inference_and_Save_ANN_CNN( xdata.to_netcdf(f"test-data/ann-cnn-{k}.nc") print("scripting...") - script_to_torchscript(model, filename="nlgw_ann-cnn_gpu_scripted.pt") + script_to_torchscript(model, filename=f"nlgw_ann-cnn_{device}_scripted.pt") print("complete") S = PRED.shape @@ -386,7 +388,7 @@ def Inference_and_Save_AttentionUNet( model.eval() count = 0 for i, (INP, OUT) in enumerate(testloader): - # print([i,count]) + # print([i, count]) INP = INP.to(device) S = OUT.shape o_output[count : count + S[0], :, :, :] = OUT[ diff --git a/utils/model_definition.py b/utils/model_definition.py index daa82f9..3860edc 100644 --- a/utils/model_definition.py +++ b/utils/model_definition.py @@ -47,27 +47,43 @@ def __init__(self, idim, odim, hdim, stencil, dropout=0.0): self.act_cnn = nn.ReLU() self.dropout0 = nn.Dropout(p=0.5 * self.dropout_prob) + self.dropout0 = nn.Dropout(p=0.5 * self.dropout_prob) # can define a block and divide it into blocks as well self.layer1 = nn.Linear(idim, hdim) # ,dtype=torch.float16) - self.act1 = nn.LeakyReLU() - + self.act1 = ( + nn.LeakyReLU() + ) # nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.GELU()#nn.ReLU() + self.bnorm1 = nn.BatchNorm1d(hdim) self.dropout = nn.Dropout(p=self.dropout_prob) - self.layer2 = nn.Linear(hdim, hdim) - self.act2 = nn.LeakyReLU() + self.act2 = ( + nn.LeakyReLU() + ) # nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.GELU()#nn.ReLU() + self.bnorm2 = nn.BatchNorm1d(hdim) # ------------------------------------------------------- self.layer3 = nn.Linear(hdim, hdim) - self.act3 = nn.LeakyReLU() + self.act3 = ( + nn.LeakyReLU() + ) # nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.GELU()#nn.ReLU() + self.bnorm3 = nn.BatchNorm1d(hdim) # ------------------------------------------------------- self.layer4 = nn.Linear(hdim, hdim) - self.act4 = nn.LeakyReLU() + self.act4 = ( + nn.LeakyReLU() + ) # nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.GELU()#nn.ReLU() + self.bnorm4 = nn.BatchNorm1d(2 * hdim) # -------------------------------------------------------- self.layer5 = nn.Linear(hdim, hdim) - self.act5 = nn.LeakyReLU() + self.act5 = ( + nn.LeakyReLU() + ) # nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.GELU()#nn.ReLU() + self.bnorm5 = nn.BatchNorm1d(hdim) # ------------------------------------------------------- self.layer6 = nn.Linear(hdim, 2 * odim) - self.act6 = nn.LeakyReLU() - + self.act6 = ( + nn.LeakyReLU() + ) # nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.LeakyReLU()#nn.Tanh()#nn.GELU()#nn.ReLU() + self.bnorm6 = nn.BatchNorm1d(2 * odim) self.output = nn.Linear(2 * odim, odim) def forward(self, x): @@ -122,23 +138,31 @@ def totalsize(self): class Conv_block(nn.Module): def __init__(self, ch_in, ch_out, kernel_size=3, stride=1, padding=1, bias=True): super().__init__() + + pad_layer = nn.Sequential( + nn.CircularPad2d((padding, padding, 0, 0)), + nn.ReplicationPad2d((0, 0, padding, padding)), + ) + self.conv = nn.Sequential( + pad_layer, nn.Conv2d( in_channels=ch_in, out_channels=ch_out, kernel_size=kernel_size, stride=stride, - padding=padding, + padding=0, bias=bias, ), nn.BatchNorm2d(ch_out), nn.ReLU(inplace=True), + pad_layer, nn.Conv2d( in_channels=ch_out, out_channels=ch_out, kernel_size=kernel_size, stride=stride, - padding=padding, + padding=0, bias=bias, ), nn.BatchNorm2d(ch_out), @@ -153,14 +177,21 @@ def forward(self, x): class Upsample(nn.Module): def __init__(self, ch_in, ch_out, kernel_size=3, stride=1, padding=1, bias=True): super().__init__() + + pad_layer = nn.Sequential( + nn.CircularPad2d((padding, padding, 0, 0)), + nn.ReplicationPad2d((0, 0, padding, padding)), + ) + self.up = nn.Sequential( + pad_layer, nn.Upsample(scale_factor=2), nn.Conv2d( in_channels=ch_in, out_channels=ch_out, kernel_size=kernel_size, - padding=padding, stride=stride, + padding=0, bias=bias, ), nn.BatchNorm2d(ch_out), @@ -176,43 +207,51 @@ class Attention_block(nn.Module): def __init__( self, F_x, F_g, F_int, kernel_size=3, stride=1, padding=1, bias=True, attn_3d=False ): + super().__init__() if attn_3d: self.F_attn = F_x else: self.F_attn = 1 - super().__init__() + pad_layer = nn.Sequential( + nn.CircularPad2d((padding, padding, 0, 0)), + nn.ReplicationPad2d((0, 0, padding, padding)), + ) + self.Wx = nn.Sequential( + pad_layer, nn.Conv2d( in_channels=F_x, out_channels=F_int, kernel_size=kernel_size, stride=stride, - padding=padding, + padding=0, bias=bias, ), nn.BatchNorm2d(F_int), ) self.Wg = nn.Sequential( + pad_layer, nn.Conv2d( in_channels=F_g, out_channels=F_int, kernel_size=kernel_size, stride=stride, - padding=padding, + padding=0, bias=bias, ), nn.BatchNorm2d(F_int), ) self.Psi = nn.Sequential( + pad_layer, nn.Conv2d( in_channels=F_int, out_channels=self.F_attn, kernel_size=kernel_size, - padding=padding, stride=stride, + padding=0, bias=bias, ), nn.BatchNorm2d(self.F_attn),