Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions merino/curated_recommendations/rankers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""Algorithms for ranking curated recommendations."""

import sentry_sdk
import logging
import math
from copy import copy
from datetime import datetime, timedelta, timezone

Expand All @@ -20,6 +23,8 @@
from scipy.stats import beta
import numpy as np

logger = logging.getLogger(__name__)


def renumber_recommendations(recommendations: list[CuratedRecommendation]) -> None:
"""Renumber the receivedRank of each recommendation to be sequential.
Expand Down Expand Up @@ -57,6 +62,99 @@ def renumber_sections(ordered_sections: list[tuple[str, Section]]) -> dict[str,

TOP_STORIES_SECTION_KEY = "top_stories_section"

# For taking down reported content
DEFAULT_REPORT_RECS_RATIO_THRESHOLD = 0.001 # using a low number for now (0.1%)
DEFAULT_SAFEGUARD_CAP_TAKEDOWN_FRACTION = 0.50 # allow at most 50% of recs to be auto-removed


def takedown_reported_recommendations(
recs: list[CuratedRecommendation],
engagement_backend: EngagementBackend,
region: str | None = None,
report_ratio_threshold: float = DEFAULT_REPORT_RECS_RATIO_THRESHOLD,
safeguard_cap_takedown_fraction: float = DEFAULT_SAFEGUARD_CAP_TAKEDOWN_FRACTION,
) -> list[CuratedRecommendation]:
"""Takedown highly-reported content & return filtered list of recommendations.

- Exclude any recommendation which breaches (report_count / impression_count) > threshold.
- Apply a safety cap: allow a certain fraction (50%) of all available recommendations to be taken down automatically.
- Log an error for the excluded rec, send the error to Sentry.

:param recs: All recommendations to filter.
:param engagement_backend: Provides report_count & impression_count data by corpusItemId.
:param region: Optionally, the client's region, e.g. 'US'.
:param report_ratio_threshold: Threshold indicating which recommendation should be excluded.
:param safeguard_cap_takedown_fraction: Max fraction of recommendations that can be auto-removed.

:return: Filtered list of recommendations.
"""
# Save engagement metrics for logging: {corpusItemId: (report_ratio, reports, impressions)}
rec_eng_metrics: dict[str, tuple[float, int, int]] = {}

def _report_ratio_for(rec: CuratedRecommendation) -> float:
if engagement := engagement_backend.get(rec.corpusItemId, region):
_impressions = int(engagement.impression_count or 0)
_reports = int(engagement.report_count or 0)
if _impressions > 0:
report_ratio = _reports / _impressions
rec_eng_metrics[rec.corpusItemId] = (report_ratio, _reports, _impressions)
return report_ratio
return -1.0 # treat as safe (won’t breach threshold)

# Filter first; common case is empty
over = [rec for rec in recs if _report_ratio_for(rec) > report_ratio_threshold]
if not over:
return recs

# Compute the safeguard cap, # of recs safe to remove from total list of reported_recs
# rounded up to avoid 0 removals
max_recs_to_remove = math.ceil(len(recs) * safeguard_cap_takedown_fraction)

# Sort only the small over-threshold set by ratio; no tie-breakers.
over.sort(key=_report_ratio_for, reverse=True)

# Select top N to remove
recs_to_remove = over[:max_recs_to_remove]
removed_rec_ids = {r.corpusItemId for r in recs_to_remove}

for rec in recs_to_remove:
ratio, reports, impressions = rec_eng_metrics.get(rec.corpusItemId, (-1.0, 0, 0))
# Log a warning for our backend logs
logger.warning(
f"Excluding reported recommendation: '{rec.title}' ({rec.url}) was excluded due to high reports",
extra={
"corpus_item_id": rec.corpusItemId,
"report_ratio": ratio,
"reports": reports,
"impressions": impressions,
"threshold": report_ratio_threshold,
"region": region,
},
)

# Send a structured event to Sentry as a warning 🟡
sentry_sdk.capture_message(
f"Excluding reported recommendation: '{rec.title}' ({rec.url}) excluded due to high reports",
level="warning",
scope=lambda scope: scope.set_context(
"excluding reported recommendation",
{
"corpus_item_id": rec.corpusItemId,
"title": rec.title,
"url": str(rec.url),
"report_ratio": ratio,
"reports": reports,
"impressions": impressions,
"threshold": report_ratio_threshold,
"region": region,
},
),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice! Sending a dedicated Sentry message gives us better context. 👍

)

# Return remaining recs
remaining_recs = [rec for rec in recs if rec.corpusItemId not in removed_rec_ids]
return remaining_recs


def thompson_sampling(
recs: list[CuratedRecommendation],
Expand Down
37 changes: 26 additions & 11 deletions merino/curated_recommendations/sections.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
section_thompson_sampling,
put_top_stories_first,
greedy_personalized_section_rank,
takedown_reported_recommendations,
)
from merino.curated_recommendations.utils import is_enrolled_in_experiment

Expand Down Expand Up @@ -534,16 +535,30 @@ async def get_sections(
rec for section in corpus_sections.values() for rec in section.recommendations
]

# 5. Rank all corpus recommendations globally by engagement to build top_stories_section
all_ranked_corpus_recommendations = thompson_sampling(
# 5. Remove reported recommendations
all_remaining_corpus_recommendations = takedown_reported_recommendations(
all_corpus_recommendations,
engagement_backend=engagement_backend,
region=region,
)

# 6. Update corpus_sections to make sure reported takedown recs are not present
remaining_ids = {rec.corpusItemId for rec in all_remaining_corpus_recommendations}
for cs in corpus_sections.values():
cs.recommendations = [
rec for rec in cs.recommendations if rec.corpusItemId in remaining_ids
]

# 7. Rank all corpus recommendations globally by engagement to build top_stories_section
all_ranked_corpus_recommendations = thompson_sampling(
all_remaining_corpus_recommendations,
engagement_backend=engagement_backend,
prior_backend=prior_backend,
region=region,
rescaler=rescaler,
)

# 6. Split top stories
# 8. Split top stories
# Use 2-row layout as default for Popular Today
top_stories_count = DOUBLE_ROW_TOP_STORIES_COUNT
popular_today_layout = layout_7_tiles_2_ads
Expand All @@ -554,11 +569,11 @@ async def get_sections(

top_stories_rec_ids = {rec.corpusItemId for rec in top_stories}

# 7. Remove top story recs from original corpus sections
# 9. Remove top story recs from original corpus sections
for cs in corpus_sections.values():
cs.recommendations = remove_top_story_recs(cs.recommendations, top_stories_rec_ids)

# 8. Rank remaining recs in sections by engagement
# 10. Rank remaining recs in sections by engagement
for cs in corpus_sections.values():
cs.recommendations = thompson_sampling(
cs.recommendations,
Expand All @@ -568,7 +583,7 @@ async def get_sections(
rescaler=rescaler,
)

# 9. Initialize sections with top stories
# 11. Initialize sections with top stories
sections: dict[str, Section] = {
"top_stories_section": Section(
receivedFeedRank=0,
Expand All @@ -578,13 +593,13 @@ async def get_sections(
)
}

# 10. Add remaining corpus sections
# 12. Add remaining corpus sections
sections.update(corpus_sections)

# 11. Prune undersized sections
# 13. Prune undersized sections
sections = get_sections_with_enough_items(sections)

# 12. Rank the sections according to follows and engagement. 'Top Stories' goes at the top.
# 14. Rank the sections according to follows and engagement. 'Top Stories' goes at the top.
sections = rank_sections(
sections,
request.sections,
Expand All @@ -593,10 +608,10 @@ async def get_sections(
experiment_rescaler=rescaler,
)

# 13. Apply final layout cycling to ranked sections except top_stories
# 15. Apply final layout cycling to ranked sections except top_stories
cycle_layouts_for_ranked_sections(sections, LAYOUT_CYCLE)

# 14. Apply ad adjustments
# 16. Apply ad adjustments
adjust_ads_in_sections(sections)

return sections
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,22 @@
class MockEngagementBackend(EngagementBackend):
"""Mock class implementing the protocol for EngagementBackend."""

def __init__(self):
# {corpusItemId: (reports, impressions)}
self.metrics: dict[str, tuple[int, int]] = {}

def get(self, corpus_item_id: str, region: str | None = None) -> Engagement | None:
"""Return random click and impression counts based on the scheduled corpus id and region."""
if corpus_item_id in self.metrics:
reports, impressions = self.metrics[corpus_item_id]
return Engagement(
corpus_item_id=corpus_item_id,
region=region,
click_count=0,
impression_count=impressions,
report_count=reports,
)

HIGH_CTR_ITEMS = {
"b2c10703-5377-4fe8-89d3-32fbd7288187": (
1_000_000 * ML_EXPERIMENT_SCALE,
Expand Down Expand Up @@ -2146,6 +2160,58 @@ def passed_region(call):
passed_region(call) == derived_region for call in spy.call_args_list
), f"No engagement.get(..., region={repr(derived_region)}) in {spy.call_args_list}"

@pytest.mark.asyncio
@pytest.mark.parametrize(
"reports,impressions,should_remove",
[
# Above threshold: 5 / 50 = 0.1 (10% > 0.1%) → should be removed
(5, 50, True),
# Below threshold: 1 / 200,000 = 0.000005 (0.0005% < 0.1%) → should stay
(1, 200_000, False),
# Exactly at threshold: 1 / 1,000 = 0.001 (0.1% == 0.1%) → should stay
(1, 1_000, False),
# No reports: 0 / 100 = 0 < 0.001 → should stay
(0, 100, False),
# No engagement data → treated as safe → should stay
(None, None, False),
],
)
async def test_takedown_reported_recommendations_parametrized(
self, engagement_backend, caplog, reports, impressions, should_remove
):
"""Verify takedown_reported_recommendations behaves correctly."""
corpus_rec_id = "080de671-e4de-4a3d-863f-8c6dd6980069"

if reports is not None and impressions is not None:
engagement_backend.metrics.update({corpus_rec_id: (reports, impressions)})

async with AsyncClient(app=app, base_url="http://test") as ac:
response = await ac.post(
"/api/v1/curated-recommendations",
json={
"locale": "en-US",
"feeds": ["sections"],
"experimentName": ExperimentName.ML_SECTIONS_EXPERIMENT.value,
"experimentBranch": "treatment",
},
)

assert response.status_code == 200
feeds = response.json()["feeds"]

response_ids = {
rec["corpusItemId"]
for sid, section in feeds.items()
if section
for rec in section.get("recommendations", [])
}

if should_remove:
assert corpus_rec_id not in response_ids
assert any("Excluding reported recommendation" in r.message for r in caplog.records)
else:
assert corpus_rec_id in response_ids


@pytest.mark.asyncio
async def test_curated_recommendations_enriched_with_icons(
Expand Down
Loading