Skip to content

Commit 32d9dcd

Browse files
committed
worker: add a job to check for typosquats
1 parent 66d4063 commit 32d9dcd

File tree

11 files changed

+700
-8
lines changed

11 files changed

+700
-8
lines changed

src/admin/enqueue_job.rs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use crate::db;
2-
use crate::schema::background_jobs;
2+
use crate::schema::{background_jobs, crates};
33
use crate::worker::jobs;
44
use crate::worker::swirl::BackgroundJob;
55
use anyhow::Result;
@@ -26,6 +26,10 @@ pub enum Command {
2626
#[arg(long = "dry-run")]
2727
dry_run: bool,
2828
},
29+
CheckTyposquat {
30+
#[arg()]
31+
name: String,
32+
},
2933
}
3034

3135
pub fn run(command: Command) -> Result<()> {
@@ -60,6 +64,21 @@ pub fn run(command: Command) -> Result<()> {
6064
Command::NormalizeIndex { dry_run } => {
6165
jobs::NormalizeIndex::new(dry_run).enqueue(conn)?;
6266
}
67+
Command::CheckTyposquat { name } => {
68+
// The job will fail if the crate doesn't actually exist, so let's check that up front.
69+
if crates::table
70+
.filter(crates::name.eq(&name))
71+
.count()
72+
.get_result::<i64>(conn)?
73+
== 0
74+
{
75+
anyhow::bail!(
76+
"cannot enqueue a typosquat check for a crate that doesn't exist: {name}"
77+
);
78+
}
79+
80+
jobs::CheckTyposquat::new(&name).enqueue(conn)?;
81+
}
6382
};
6483

6584
Ok(())

src/controllers/krate/publish.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//! Functionality related to publishing a new crate or version of a crate.
22
33
use crate::auth::AuthCheck;
4-
use crate::worker::jobs;
4+
use crate::worker::jobs::{self, CheckTyposquat};
55
use crate::worker::swirl::BackgroundJob;
66
use axum::body::Bytes;
77
use cargo_manifest::{Dependency, DepsSet, TargetDepsSet};
@@ -85,7 +85,7 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra
8585
// this query should only be used for the endpoint scope calculation
8686
// since a race condition there would only cause `publish-new` instead of
8787
// `publish-update` to be used.
88-
let existing_crate = Crate::by_name(&metadata.name)
88+
let existing_crate: Option<Crate> = Crate::by_name(&metadata.name)
8989
.first::<Crate>(conn)
9090
.optional()?;
9191

@@ -222,9 +222,10 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra
222222
return Err(cargo_err("expected at most 5 categories per crate"));
223223
}
224224

225-
let max_features = existing_crate
226-
.and_then(|c| c.max_features.map(|mf| mf as usize))
227-
.unwrap_or(app.config.max_features);
225+
let max_features = match &existing_crate {
226+
Some(c) => c.max_features.map(|mf| mf as usize),
227+
None => None,
228+
}.unwrap_or(app.config.max_features);
228229

229230
let features = tarball_info.manifest.features.unwrap_or_default();
230231
let num_features = features.len();
@@ -393,6 +394,11 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra
393394

394395
jobs::enqueue_sync_to_index(&krate.name, conn)?;
395396

397+
// Experiment: check new crates for potential typosquatting.
398+
if existing_crate.is_none() {
399+
CheckTyposquat::new(&krate.name).enqueue(conn)?;
400+
}
401+
396402
// The `other` field on `PublishWarnings` was introduced to handle a temporary warning
397403
// that is no longer needed. As such, crates.io currently does not return any `other`
398404
// warnings at this time, but if we need to, the field is available.

src/worker/environment.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,14 @@ use crate::fastly::Fastly;
44
use crate::storage::Storage;
55
use crate::Emails;
66
use crates_io_index::{Repository, RepositoryConfig};
7+
use diesel::PgConnection;
78
use parking_lot::{Mutex, MutexGuard};
89
use std::ops::{Deref, DerefMut};
9-
use std::sync::Arc;
10+
use std::sync::{Arc, OnceLock};
1011
use std::time::Instant;
1112

13+
use super::typosquat;
14+
1215
pub struct Environment {
1316
repository_config: RepositoryConfig,
1417
repository: Mutex<Option<Repository>>,
@@ -17,6 +20,9 @@ pub struct Environment {
1720
pub storage: Arc<Storage>,
1821
pub connection_pool: DieselPool,
1922
pub emails: Emails,
23+
24+
/// A lazily initialised cache of the most popular crates ready to use in typosquatting checks.
25+
typosquat_cache: OnceLock<Result<typosquat::Cache, typosquat::CacheError>>,
2026
}
2127

2228
impl Environment {
@@ -36,6 +42,7 @@ impl Environment {
3642
storage,
3743
connection_pool,
3844
emails,
45+
typosquat_cache: OnceLock::default(),
3946
}
4047
}
4148

@@ -65,6 +72,19 @@ impl Environment {
6572
pub(crate) fn fastly(&self) -> Option<&Fastly> {
6673
self.fastly.as_ref()
6774
}
75+
76+
/// Returns the typosquatting cache, initialising it if required.
77+
pub(crate) fn typosquat_cache(
78+
&self,
79+
conn: &mut PgConnection,
80+
) -> Result<&typosquat::Cache, typosquat::CacheError> {
81+
// We have to pass conn back in here because the caller might be in a transaction, and
82+
// getting a new connection here to query crates can result in a deadlock.
83+
self.typosquat_cache
84+
.get_or_init(|| typosquat::Cache::from_env(conn))
85+
.as_ref()
86+
.map_err(|e| e.clone())
87+
}
6888
}
6989

7090
pub struct RepositoryLock<'a> {

src/worker/jobs/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@ mod daily_db_maintenance;
99
pub mod dump_db;
1010
mod git;
1111
mod readmes;
12+
mod typosquat;
1213
mod update_downloads;
1314

1415
pub use self::daily_db_maintenance::DailyDbMaintenance;
1516
pub use self::dump_db::DumpDb;
1617
pub use self::git::{NormalizeIndex, SquashIndex, SyncToGitIndex, SyncToSparseIndex};
1718
pub use self::readmes::RenderAndUploadReadme;
19+
pub use self::typosquat::CheckTyposquat;
1820
pub use self::update_downloads::UpdateDownloads;
1921

2022
/// Enqueue both index sync jobs (git and sparse) for a crate, unless they

src/worker/jobs/typosquat.rs

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
use std::sync::Arc;
2+
3+
use diesel::PgConnection;
4+
use typomania::Package;
5+
6+
use crate::{
7+
worker::{
8+
swirl::{BackgroundJob, PerformState},
9+
typosquat::{Cache, Crate},
10+
Environment,
11+
},
12+
Emails,
13+
};
14+
15+
/// A job to check the name of a newly published crate against the most popular crates to see if
16+
/// the new crate might be typosquatting an existing, popular crate.
17+
#[derive(Serialize, Deserialize, Debug)]
18+
pub struct CheckTyposquat {
19+
name: String,
20+
}
21+
22+
impl CheckTyposquat {
23+
pub fn new(name: &str) -> Self {
24+
Self { name: name.into() }
25+
}
26+
}
27+
28+
impl BackgroundJob for CheckTyposquat {
29+
const JOB_NAME: &'static str = "check_typosquat";
30+
31+
type Context = Arc<Environment>;
32+
33+
#[instrument(skip(state, env), err)]
34+
fn run(&self, state: PerformState<'_>, env: &Self::Context) -> anyhow::Result<()> {
35+
let cache = env.typosquat_cache(state.conn)?;
36+
check(&env.emails, cache, state.conn, &self.name)
37+
}
38+
}
39+
40+
fn check(
41+
emails: &Emails,
42+
cache: &Cache,
43+
conn: &mut PgConnection,
44+
name: &str,
45+
) -> anyhow::Result<()> {
46+
if let Some(harness) = cache.get_harness() {
47+
info!(name, "Checking new crate for potential typosquatting");
48+
49+
let krate: Box<dyn Package> = Box::new(Crate::from_name(conn, name)?);
50+
let squats = harness.check_package(name, krate)?;
51+
if !squats.is_empty() {
52+
// Well, well, well. For now, the only action we'll take is to e-mail people who
53+
// hopefully care to check into things more closely.
54+
info!(?squats, "Found potential typosquatting");
55+
56+
for email in cache.iter_emails() {
57+
if let Err(e) = emails.send_possible_typosquat_notification(email, name, &squats) {
58+
error!(?e, ?email, "Failed to send possible typosquat notification");
59+
}
60+
}
61+
}
62+
}
63+
64+
Ok(())
65+
}
66+
67+
#[cfg(test)]
mod tests {
    use crate::{test_util::pg_connection, worker::typosquat::test_util::Faker};

    use super::*;

    /// Address used both to configure the cache and to verify the recipient of the sent mail.
    const RECIPIENT: &str = "admin@example.com";

    /// Runs `check` against a real database connection: a harmless crate name must not trigger
    /// a notification, while a name resembling a popular crate must.
    #[test]
    fn integration() -> anyhow::Result<()> {
        let emails = Emails::new_in_memory();
        let mut faker = Faker::new(pg_connection());

        // Set up a user and a popular crate to match against.
        let user = faker.user("a")?;
        faker.crate_and_version("my-crate", "It's awesome", &user, 100)?;

        // Prime the cache so it only includes the crate we just created.
        let cache = Cache::new(vec![RECIPIENT.to_string()], faker.borrow_conn())?;

        // Now we'll create new crates: one problematic, one not so.
        let other_user = faker.user("b")?;
        let (angel, _version) = faker.crate_and_version(
            "innocent-crate",
            "I'm just a simple, innocent crate",
            &other_user,
            0,
        )?;
        let (demon, _version) = faker.crate_and_version(
            "mycrate",
            "I'm even more innocent, obviously",
            &other_user,
            0,
        )?;

        // OK, we're done faking stuff.
        let mut conn = faker.into_conn();

        // Run the check with a crate that shouldn't cause problems.
        check(&emails, &cache, &mut conn, &angel.name)?;
        assert!(emails.mails_in_memory().unwrap().is_empty());

        // Now run the check with a less innocent crate.
        check(&emails, &cache, &mut conn, &demon.name)?;
        let sent_mail = emails.mails_in_memory().unwrap();
        assert!(!sent_mail.is_empty());
        let sent = sent_mail.into_iter().next().unwrap();
        assert_eq!(&sent.to, RECIPIENT);

        Ok(())
    }
}

src/worker/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use std::sync::Arc;
1111
mod environment;
1212
pub mod jobs;
1313
pub mod swirl;
14+
mod typosquat;
1415

1516
pub use self::environment::Environment;
1617

@@ -20,7 +21,8 @@ pub trait RunnerExt {
2021

2122
impl RunnerExt for Runner<Arc<Environment>> {
2223
fn register_crates_io_job_types(self) -> Self {
23-
self.register_job_type::<jobs::DailyDbMaintenance>()
24+
self.register_job_type::<jobs::CheckTyposquat>()
25+
.register_job_type::<jobs::DailyDbMaintenance>()
2426
.register_job_type::<jobs::DumpDb>()
2527
.register_job_type::<jobs::NormalizeIndex>()
2628
.register_job_type::<jobs::RenderAndUploadReadme>()

src/worker/typosquat/cache.rs

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
use std::sync::Arc;
2+
3+
use diesel::PgConnection;
4+
use thiserror::Error;
5+
use typomania::{
6+
checks::{Bitflips, Omitted, SwappedWords, Typos},
7+
Harness,
8+
};
9+
10+
use super::{config, database::TopCrates};
11+
12+
/// Name of the environment variable that [`Cache::from_env`] reads for the comma-separated list
/// of notification e-mail addresses.
static NOTIFICATION_EMAILS_ENV: &str = "TYPOSQUAT_NOTIFICATION_EMAILS";
13+
14+
/// A cache containing everything we need to run typosquatting checks.
15+
///
16+
/// Specifically, this includes a corpus of popular crates attached to a typomania harness, and a
17+
/// list of e-mail addresses that we'll send notifications to if potential typosquatting is
18+
/// discovered.
19+
pub struct Cache {
20+
emails: Vec<String>,
21+
harness: Option<Harness<TopCrates>>,
22+
}
23+
24+
impl Cache {
25+
/// Instantiates a new [`Cache`] from the environment.
26+
///
27+
/// This reads the [`NOTIFICATION_EMAILS_ENV`] environment variable to get the list of e-mail
28+
/// addresses to send notifications to, then invokes [`Cache::new`] to read popular crates from
29+
/// the database.
30+
#[instrument(skip_all, err)]
31+
pub fn from_env(conn: &mut PgConnection) -> Result<Self, Error> {
32+
let emails: Vec<String> = crates_io_env_vars::var(NOTIFICATION_EMAILS_ENV)
33+
.map_err(|e| Error::Environment {
34+
name: NOTIFICATION_EMAILS_ENV.into(),
35+
source: Arc::new(e),
36+
})?
37+
.unwrap_or_default()
38+
.split(',')
39+
.map(|s| s.trim().to_owned())
40+
.filter(|s| !s.is_empty())
41+
.collect();
42+
43+
if emails.is_empty() {
44+
// If we're not notifying anyone, then there's really not much to do here.
45+
warn!("$TYPOSQUAT_NOTIFICATION_EMAILS is not set; no typosquatting notifications will be sent");
46+
Ok(Self {
47+
emails,
48+
harness: None,
49+
})
50+
} else {
51+
// Otherwise, let's go get the top crates and build a corpus.
52+
Self::new(emails, conn)
53+
}
54+
}
55+
56+
/// Instantiates a cache by querying popular crates and building them into a typomania harness.
57+
///
58+
/// This relies on configuration in the [`super::config`] module.
59+
pub fn new(emails: Vec<String>, conn: &mut PgConnection) -> Result<Self, Error> {
60+
let top = TopCrates::new(conn, config::TOP_CRATES)?;
61+
62+
Ok(Self {
63+
emails,
64+
harness: Some(
65+
Harness::builder()
66+
.with_check(Bitflips::new(
67+
config::CRATE_NAME_ALPHABET,
68+
top.crates.keys().map(String::as_str),
69+
))
70+
.with_check(Omitted::new(config::CRATE_NAME_ALPHABET))
71+
.with_check(SwappedWords::new("-_"))
72+
.with_check(Typos::new(config::TYPOS.iter().map(|(c, typos)| {
73+
(*c, typos.iter().map(|ss| ss.to_string()).collect())
74+
})))
75+
.build(top),
76+
),
77+
})
78+
}
79+
80+
pub fn get_harness(&self) -> Option<&Harness<TopCrates>> {
81+
self.harness.as_ref()
82+
}
83+
84+
pub fn iter_emails(&self) -> impl Iterator<Item = &str> {
85+
self.emails.iter().map(String::as_str)
86+
}
87+
}
88+
89+
// Because the error returned from Cache::new() gets memoised in the environment, we either need to
90+
// return it by reference from Environment::typosquat_cache() or we need to be able to clone it.
91+
// We'll do some Arc wrapping in the variants below to ensure that everything is clonable while not
92+
// destroying the source metadata.
93+
#[derive(Error, Debug, Clone)]
94+
pub enum Error {
95+
#[error("error reading environment variable {name}: {source:?}")]
96+
Environment {
97+
name: String,
98+
#[source]
99+
source: Arc<anyhow::Error>,
100+
},
101+
102+
#[error("error getting top crates: {0:?}")]
103+
TopCrates(#[source] Arc<diesel::result::Error>),
104+
}
105+
106+
impl From<diesel::result::Error> for Error {
107+
fn from(value: diesel::result::Error) -> Self {
108+
Self::TopCrates(Arc::new(value))
109+
}
110+
}

0 commit comments

Comments
 (0)