-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathmulti_obj_sampling.py
More file actions
166 lines (130 loc) · 8.27 KB
/
multi_obj_sampling.py
File metadata and controls
166 lines (130 loc) · 8.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import numpy as np
# Samples from N buckets where each bucket corresponds to a single objective
class MultiObjectiveSamplingFromBucketsElicitation:
    """Sample items for preference elicitation from per-objective buckets.

    Three objectives are handled: relevance, novelty and diversity. For each
    objective the items are sorted by their score, the sorted list is split
    into a configurable number of buckets and ``n_samples_per_bucket`` items
    are drawn from every bucket without replacement, with probability
    proportional to the item's score inside that bucket.

    Relevance and novelty scores are computed once from the rating matrix.
    Diversity is recomputed before every single draw as the summed distance
    of each item to all items selected so far.
    """

    def __init__(self, rating_matrix, distance_matrix, n_relevance_buckets, n_diversity_buckets, n_novelty_buckets, n_samples_per_bucket, k=1.0, **kwargs):
        """Store the input matrices and the bucket configuration.

        Args:
            rating_matrix: 2-D array whose columns are items (scores are
                aggregated over axis 0; rows are presumably users).
            distance_matrix: array indexed by item on its first axis; row
                ``i`` holds distances of item ``i`` to every item
                (presumably a square item-by-item matrix).
            n_relevance_buckets: number of buckets for the relevance objective.
            n_diversity_buckets: number of buckets for the diversity objective.
            n_novelty_buckets: number of buckets for the novelty objective.
            n_samples_per_bucket: how many items are drawn from each bucket
                (the same count is used for all objectives).
            k: exponent applied to popularity and relevance scores.
            **kwargs: accepted and ignored.
        """
        self.rating_matrix = rating_matrix
        self.distance_matrix = distance_matrix
        self.n_buckets = {
            "relevance": n_relevance_buckets,
            "diversity": n_diversity_buckets,
            "novelty": n_novelty_buckets,
        }
        self.n_samples_per_bucket = {
            "relevance": [n_samples_per_bucket] * n_relevance_buckets,
            "diversity": [n_samples_per_bucket] * n_diversity_buckets,
            "novelty": [n_samples_per_bucket] * n_novelty_buckets,
        }
        self.k = k

    def _calculate_item_popularities(self, rating_matrix):
        """Return, per item, the fraction of rows with a positive rating, raised to ``k``."""
        return np.power(np.sum(rating_matrix > 0.0, axis=0) / rating_matrix.shape[0], self.k)

    # Relevances are simply mean ratings of each item
    def _calculate_item_relevances(self, rating_matrix):
        """Return, per item, the mean rating raised to ``k``.

        Beware that zero (non-rated) entries are included in the mean.
        """
        return np.power(rating_matrix.mean(axis=0), self.k)

    # Novelties are inverse popularity
    def _calculate_item_novelties(self, rating_matrix):
        """Return, per item, the negated popularity (values are <= 0)."""
        return -self._calculate_item_popularities(rating_matrix)

    def _sample_from_score_buckets(self, scores, objective, result, offset, extra_data):
        """Fill ``result`` with samples drawn from the buckets of one objective.

        Items are sorted by descending ``scores`` and split into
        ``self.n_buckets[objective]`` buckets; from each bucket the configured
        number of items is drawn without replacement with probability
        ``score / sum(bucket scores)``. Items whose score is exactly zero
        therefore have zero probability of being drawn.

        Args:
            scores: 1-D array with one score per item.
            objective: key into ``self.n_buckets`` / ``self.n_samples_per_bucket``,
                also used as the prefix of the bucket label.
            result: output index array, written in place starting at ``offset``.
            offset: first free position in ``result``.
            extra_data: list of bucket labels, extended in place with one
                label per drawn item.

        Returns:
            The offset just past the last written position.
        """
        n_buckets = self.n_buckets[objective]
        order = np.argsort(-scores)
        item_buckets = np.array_split(order, n_buckets)
        score_buckets = np.array_split(scores[order], n_buckets)
        quotas = self.n_samples_per_bucket[objective]
        for bucket_idx, (items, bucket_scores, n_samples) in enumerate(zip(item_buckets, score_buckets, quotas)):
            samples = np.random.choice(
                items,
                size=n_samples,
                p=bucket_scores / bucket_scores.sum(),
                replace=False,
            )
            result[offset:offset + n_samples] = samples
            offset += n_samples
            extra_data.extend([f"{objective}_bucket with idx={bucket_idx + 1}/{n_buckets}"] * n_samples)
        return offset

    def get_initial_data(self, movie_indices_to_ignore=None):
        """Sample item indices covering all relevance, novelty and diversity buckets.

        Relevance buckets are sampled first, then novelty buckets (with the
        already selected items excluded), and finally diversity buckets, where
        diversity is recomputed against the current selection before each draw.

        Args:
            movie_indices_to_ignore: optional sequence (list or integer numpy
                array) of item indices that must not be sampled.

        Returns:
            1-D ``int32`` array of sampled item indices in random order; its
            length is the sum of all per-bucket sample counts.

        Raises:
            ValueError: propagated from ``np.random.choice`` when a bucket
                does not hold enough items with non-zero probability.
        """
        if movie_indices_to_ignore is None:
            ignored = np.empty((0,), dtype=np.intp)
        else:
            # Normalise once so that both lists and numpy arrays are accepted.
            ignored = np.asarray(movie_indices_to_ignore, dtype=np.intp)

        relevances = self._calculate_item_relevances(self.rating_matrix)
        novelties = self._calculate_item_novelties(self.rating_matrix)

        if ignored.size:
            # A zero score gives zero sampling probability inside a bucket.
            relevances[ignored] = 0.0
            novelties[ignored] = 0.0

        n_items_total = sum(sum(counts) for counts in self.n_samples_per_bucket.values())
        result = np.zeros((n_items_total, ), dtype=np.int32)
        extra_data = []  # Bucket label of every sampled item (debugging aid)

        # Fill in relevance buckets
        offset = self._sample_from_score_buckets(relevances, "relevance", result, 0, extra_data)

        # Zero everything selected in relevance sampling so it is not sampled again.
        # NOTE(review): novelties are non-positive, so zeroed items sort to the
        # front of the first novelty bucket (with probability zero), and inside
        # a bucket the probability is proportional to popularity - confirm intended.
        novelties[result[:offset]] = 0.0

        # Fill in novelty buckets
        offset = self._sample_from_score_buckets(novelties, "novelty", result, offset, extra_data)

        # Fill in diversity buckets, one item at a time
        n_diversity_buckets = self.n_buckets["diversity"]
        accums = np.add.accumulate(self.n_samples_per_bucket["diversity"])
        for i in range(sum(self.n_samples_per_bucket["diversity"])):
            # Re-read the selection on every iteration so that items drawn in
            # earlier diversity steps are excluded too (no duplicates).
            selected_so_far = result[:offset]

            # Compute "approximate" diversity of each item to the list we have so far
            diversities = self.distance_matrix[selected_so_far].sum(axis=0)
            if ignored.size:
                diversities[ignored] = 0.0
            diversities[selected_so_far] = 0.0
            diversities /= diversities.sum()  # Normalize to 1

            # Prepare buckets based on diversities of all items w.r.t. CURRENT set of sampled items
            order = np.argsort(-diversities)

            # Find the bucket the i-th diversity sample belongs to
            current_target_bucket = np.searchsorted(accums, i, "right")
            items_bucket = np.array_split(order, n_diversity_buckets)[current_target_bucket]
            diversities_bucket = np.array_split(diversities[order], n_diversity_buckets)[current_target_bucket]

            result[offset:offset + 1] = np.random.choice(
                items_bucket,
                size=1,
                p=diversities_bucket / diversities_bucket.sum(),
                replace=False,
            )
            offset += 1
            extra_data.append(f"diversity_bucket with idx={current_target_bucket + 1}/{n_diversity_buckets}")

        assert len(extra_data) == result.shape[0], f"{len(extra_data)}!={result.shape[0]}"
        assert result.shape[0] == n_items_total, f"{result.shape[0]}!={n_items_total}"

        # Shuffle so the position of an item does not reveal its bucket
        p = np.random.permutation(n_items_total)
        return result[p]