6 changes: 4 additions & 2 deletions .gitignore
@@ -107,10 +107,12 @@ venv.bak/
.vscode/*
*.code-workspace

# JetBrains IDEA Template
.idea

# Local storages
data/*
datasets/*/
models/*

# Log files
*.log
*.txt
2 changes: 1 addition & 1 deletion configs/BASE-CADA-VAE.yaml
@@ -12,4 +12,4 @@ CLS:
  AMSGRAD: True
DATA:
  FEAT_EMB:
-    PATH: "data/CUB/resnet101/"
+    PATH: "datasets/CUB/resnet101/"
Member: I'm still not sure about this renaming.

6 changes: 6 additions & 0 deletions datasets/README.md
@@ -0,0 +1,6 @@
Using built-in datasets
=======================

ZeroShotEval ships with two built-in datasets:
- AWA2
- CUB
Member (on lines +4 to +6): What prevents using, for example, SUN?

168 changes: 168 additions & 0 deletions datasets/parse_cub_resnet101_mat.py
@@ -0,0 +1,168 @@
"""Script for precomputed embeddings loading for
the datasets: CUB, SUN, AWA1, AWA2.
The script loads precomputed embeddings from .mat file and
transforms it into a unified format (pickle).
NOTE: Embeddings were computed using ResNet101 network pretrained on ImageNet
The data provided by unofficial resource and packed into a single .mat file.
!!! IMPORTANT !!!
Please note, that this script is designed for CUB, SUN, AWA1, AWA2
datasets only, and moreover these datasets must be downloaded from:
https://www.dropbox.com/sh/btoc495ytfbnbat/AAAaurkoKnnk0uV-swgF-gdSa?dl=0
Member suggested change:
- Please note that this script is designed for the CUB, SUN, AWA1, AWA2
- datasets only; moreover, these datasets must be downloaded from:
- https://www.dropbox.com/sh/btoc495ytfbnbat/AAAaurkoKnnk0uV-swgF-gdSa?dl=0
+ Please note that this script is designed for the CUB, SUN, AWA1, AWA2
+ .. _datasets:
+ https://www.dropbox.com/sh/btoc495ytfbnbat/AAAaurkoKnnk0uV-swgF-gdSa?dl=0

"""

import argparse
import os
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd
from scipy import io as sio

from zeroshoteval.utils import setup_logger

logger = setup_logger()


def init_arguments():
    parser = argparse.ArgumentParser(description="Script for embeddings transformation to pickle file")
    parser.add_argument("--path", required=True,
                        help="Path to the dataset to transform.")
    parser.add_argument("--output-dir", required=True,
                        help="Path to the output directory where the transformed embeddings will be saved.")
    return parser


def load_arguments():
    parser = init_arguments()
    args = parser.parse_args()

    args.path = Path(args.path)
    args.output_dir = Path(args.output_dir)

    return args


def read_data(images_mat_file: str,
@termit209 (Dec 18, 2020): Maybe read_data_mat or a similar name would be clearer?

              cls_attributes_mat_file: str,
              root_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    mat_data: Dict = sio.loadmat(os.path.join(root_path, images_mat_file))
    img_data: np.ndarray = mat_data["features"].T

    # NOTE: numpy array index starts from 0, matlab starts from 1
    labels_data: np.ndarray = mat_data["labels"].astype(int).squeeze() - 1

    mat_data = sio.loadmat(os.path.join(root_path, cls_attributes_mat_file))
    cls_attr_data: np.ndarray = mat_data["att"].T

    cls_attr_data = extend_clsattr_data(cls_attr_data, labels_data)

    # TODO: deal with supporting data
    # if dataset == "CUB":
    #     with open(root_path / "CUB_supporting_data.p", "rb") as h:
    #         x = pickle.load(h)
    #     for key, value in x.items():
    #         aux_modalities_embs[key] = value

    logger.info("Image data, class attributes data and labels were successfully parsed.")
    return img_data, cls_attr_data, labels_data


def extend_clsattr_data(cls_attr_data: np.ndarray, labels_data: np.ndarray) -> np.ndarray:
    """Extends cls_attr_data from shape (num_classes, emb_size) to
    (num_instances, emb_size) by duplicating class embeddings according
    to the labels in labels_data.

    Args:
        cls_attr_data: Array with embeddings of class attributes.
        labels_data: Array with dataset labels.

    Returns:
        Array with the extended cls_attr_data.
    """
    if len(cls_attr_data) == len(labels_data):
        logger.warning("Class attributes data seems to be already extended: the array has the same length as the labels.")
        return cls_attr_data
    else:
        extended_clsattr = [cls_attr_data[label] for label in labels_data]
        logger.debug("Class attributes data were successfully extended according to labels data.")
        return np.stack(extended_clsattr)
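
# A quick illustration with toy values (illustrative only, not from a real
# dataset): three classes with 2-dim attribute vectors, expanded over four
# instances.
#   cls_attr = np.arange(6).reshape(3, 2)  # (num_classes=3, emb_size=2)
#   labels = np.array([0, 2, 2, 1])        # (num_instances=4,)
#   extend_clsattr_data(cls_attr, labels)  # -> shape (4, 2): rows 0, 2, 2, 1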


def read_data_splits(splits_mat_file: str, root_path: str) -> pd.DataFrame:
    mat_data = sio.loadmat(os.path.join(root_path, splits_mat_file))

    df = pd.DataFrame(columns=["id", "is_train", "is_seen"])

    # NOTE: numpy array index starts from 0, matlab starts from 1
    train_indexes = mat_data["trainval_loc"].squeeze() - 1
    test_seen_indexes = mat_data["test_seen_loc"].squeeze() - 1
    test_unseen_indexes = mat_data["test_unseen_loc"].squeeze() - 1

    # NOTE: there are two more fields in the mat files that are not in use now, but can be used
    # train_loc = mat_data['train_loc'].squeeze() - 1          # --> train_feature = TRAIN SEEN
    # val_unseen_loc = mat_data['val_loc'].squeeze() - 1       # --> test_unseen_feature = TEST UNSEEN
Member: Maybe these should be deleted?


    for part, params in zip((train_indexes, test_seen_indexes, test_unseen_indexes),
                            ((1, 1), (0, 1), (0, 0))):
        part_df = pd.DataFrame(columns=df.columns)
        part_df.loc[:, "id"] = part
        part_df.loc[:, "is_train"] = params[0]
        part_df.loc[:, "is_seen"] = params[1]
        df = df.append(part_df)

    df.reset_index(drop=True, inplace=True)

    logger.info("Data splits were successfully parsed.")
    return df
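
# The resulting frame has one row per instance id, e.g. (toy values):
#      id  is_train  is_seen
#   0   4         1        1    <- from trainval_loc
#   1   0         0        1    <- from test_seen_loc
#   2   7         0        0    <- from test_unseen_loc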


def parse_mat_dataset(path: str, mat_data_file="res101.mat", mat_attributes_file="att_splits.mat"):
    """Loads the specified dataset from .mat files into NumPy arrays
    and a splits DataFrame for further saving to disk.
    """
    path = Path(path)
    dataset: str = path.name.upper()
    valid_datasets = ["CUB", "SUN", "AWA1", "AWA2"]
    assert dataset in valid_datasets, \
        f"Unknown dataset! This script supports only these datasets: {valid_datasets}"

    logger.info(f"Parsing dataset {dataset} from mat files into NumPy arrays. Source path: {path}")

    img_data, cls_attr_data, labels_data = read_data(mat_data_file, mat_attributes_file, root_path=path)

    splits_df = read_data_splits(mat_attributes_file, root_path=path)

    return img_data, cls_attr_data, labels_data, splits_df


def save_data(img_data: np.ndarray,
              cls_attr_data: np.ndarray,
              labels_data: np.ndarray,
              splits_df: pd.DataFrame,
              dataset: str,
              save_dir: str):
    save_dir = Path(save_dir)

    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    np.save(file=save_dir / f"{dataset}_img_embeddings", arr=img_data)
    np.save(file=save_dir / f"{dataset}_clsattr_embeddings", arr=cls_attr_data)
    np.save(file=save_dir / f"{dataset}_labels", arr=labels_data)

    splits_df.to_csv(save_dir / f"{dataset}_splits.csv", index=False)

    logger.info(f"Parsed data for {dataset} dataset was successfully saved to {save_dir.absolute()}")
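
# Note: np.save appends the ".npy" extension when it is missing, so for CUB
# the output directory ends up containing CUB_img_embeddings.npy,
# CUB_clsattr_embeddings.npy, CUB_labels.npy and CUB_splits.csv.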


if __name__ == "__main__":
    args = load_arguments()

    img, cls_attr, labels, splits = parse_mat_dataset(path=args.path)

    dataset_name = Path(args.path).name
    save_data(img, cls_attr, labels, splits, dataset_name, save_dir=args.output_dir)
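
# A hedged usage sketch (paths illustrative; the output dir below matches the
# PATH set in configs/BASE-CADA-VAE.yaml in this PR):
#   python datasets/parse_cub_resnet101_mat.py \
#       --path datasets/CUB --output-dir datasets/CUB/resnet101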
15 changes: 0 additions & 15 deletions linter.sh.save

This file was deleted.

3 changes: 0 additions & 3 deletions misc/README.md

This file was deleted.

18 changes: 0 additions & 18 deletions misc/awa2_download.sh

This file was deleted.
