From a90ba89ab26c2a1f5438da7e69521f3c1640ad1e Mon Sep 17 00:00:00 2001
From: DanielFGray <danielfgray@gmail.com>
Date: Sun, 13 Jul 2025 01:10:03 -0500
Subject: [PATCH] user stats compared to cohort closes #167

---
 app/helpers/cohortAnalysis.ts   | 548 ++++++++++++++++++++++++++++++++
 app/models/activity.server.ts   |  18 ++
 app/routes/__auth/dashboard.tsx |  37 ++-
 app/routes/__auth/sh-user.tsx   |  16 +-
 4 files changed, 608 insertions(+), 11 deletions(-)
 create mode 100644 app/helpers/cohortAnalysis.ts

diff --git a/app/helpers/cohortAnalysis.ts b/app/helpers/cohortAnalysis.ts
new file mode 100644
index 0000000..fa4aab8
--- /dev/null
+++ b/app/helpers/cohortAnalysis.ts
@@ -0,0 +1,548 @@
+import { createMessageStatsQuery } from "#~/models/activity.server";
+import { percentile, descriptiveStats } from "#~/helpers/statistics";
+import { sql } from "kysely";
+import { partition } from "lodash-es";
+import type { CodeStats } from "#~/discord/activityTracker.js";
+import { fillDateGaps } from "./dateUtils";
+
+const performanceThresholds = [ // highest cut-off first; lookups take the first match
+  { min: 90, value: "top" },
+  { min: 70, value: "above_average" },
+  { min: 30, value: "average" },
+  { min: 10, value: "below_average" },
+  { min: -Infinity, value: "bottom" }, // catch-all tier for averages below 10
+] as const;
+
+type MetricConfig = {
+  key: "messageCount" | "reactionCount" | "codeChars" | "longestStreak";
+  strength: string; // label reported when the user's percentile is >= 50
+  improvement: string; // label reported when the user's percentile is < 50
+};
+// Metrics scored for insights; `readonly` so the `as const` tuple type-checks.
+const metricsConfig: readonly MetricConfig[] = [
+  {
+    key: "messageCount",
+    strength: "High message volume",
+    improvement: "Message frequency",
+  },
+  {
+    key: "reactionCount",
+    strength: "Strong community engagement",
+    improvement: "Community engagement",
+  },
+  {
+    key: "codeChars",
+    strength: "Significant code contributions",
+    improvement: "Code sharing",
+  },
+  {
+    key: "longestStreak",
+    strength: "Excellent consistency",
+    improvement: "Activity consistency",
+  },
+] as const;
+
+export interface UserCohortMetrics { // one cohort member's metrics for the range
+  userId: string;
+  messageCount: number;
+  wordCount: number;
+  reactionCount: number;
+  codeStats: {
+    totalChars: number; // sum of `chars` over all parsed code-stat entries
+    totalLines: number; // sum of `lines` over all parsed code-stat entries
+    languageBreakdown: Record<string, number>; // chars per language
+    topLanguages: Array<{ // at most five entries, most chars first
+      language: string;
+      chars: number;
+      percentage: number; // share of totalChars, 0-100
+    }>;
+  };
+  streakData: {
+    longestStreak: number; // longest run of consecutive active entries
+    currentStreak: number; // run of active entries ending at the last entry
+    consistencyScore: number; // activeDays / totalDays * 100
+    activeDays: number; // entries with messageCount > 0
+    totalDays: number; // number of daily entries considered
+  };
+}
+
+export interface CohortBenchmarks { // cohort-wide distribution of each metric
+  messageCount: PercentileBenchmarks;
+  wordCount: PercentileBenchmarks;
+  reactionCount: PercentileBenchmarks;
+  codeChars: PercentileBenchmarks; // over codeStats.totalChars
+  codeLines: PercentileBenchmarks; // over codeStats.totalLines
+  longestStreak: PercentileBenchmarks; // over streakData.longestStreak
+  consistencyScore: PercentileBenchmarks; // over streakData.consistencyScore
+  languageDistribution: Record<string, PercentileBenchmarks>; // keyed by language
+}
+
+export interface PercentileBenchmarks { // summary of one numeric sample
+  p10: number; // percentile(data, 0.1)
+  p25: number; // percentile(data, 0.25)
+  p50: number; // median
+  p75: number; // percentile(data, 0.75)
+  p90: number; // percentile(data, 0.9)
+  p95: number; // percentile(data, 0.95)
+  p99: number; // percentile(data, 0.99)
+  mean: number;
+  stdDev: number; // taken from descriptiveStats(...).standardDeviation
+  min: number;
+  max: number;
+}
+
+export interface UserCohortComparison { // result of compareUserToCohort
+  user: UserCohortMetrics;
+  percentiles: { // each value: % of cohort values <= the user's value (0-100)
+    messageCount: number;
+    wordCount: number;
+    reactionCount: number;
+    codeChars: number;
+    codeLines: number;
+    longestStreak: number;
+    consistencyScore: number;
+    topLanguagePercentiles: Record<string, number>; // keyed by language name
+  };
+  rankings: { // rank = 1 + members with a strictly greater value; total = cohort size
+    messageCount: { rank: number; total: number };
+    wordCount: { rank: number; total: number };
+    reactionCount: { rank: number; total: number };
+    codeChars: { rank: number; total: number };
+    longestStreak: { rank: number; total: number };
+  };
+  cohortInsights: { // summary derived from the percentiles above
+    overallPerformance:
+      | "top"
+      | "above_average"
+      | "average"
+      | "below_average"
+      | "bottom";
+    strengths: string[]; // labels of configured metrics with percentile >= 50
+    improvementAreas: string[]; // labels of configured metrics with percentile < 50
+  };
+}
+
+/**
+ * Summarises a sample as percentile cut-offs plus mean, stdDev, min and max.
+ * An empty sample yields all-zero benchmarks without calling the stat helpers.
+ */
+function calculatePercentileBenchmarks(data: number[]): PercentileBenchmarks {
+  if (!data.length) {
+    return {
+      p10: 0,
+      p25: 0,
+      p50: 0,
+      p75: 0,
+      p90: 0,
+      p95: 0,
+      p99: 0,
+      mean: 0,
+      stdDev: 0,
+      min: 0,
+      max: 0,
+    };
+  }
+  const summary = descriptiveStats(data);
+  const at = (fraction: number): number => percentile(data, fraction);
+  const cutoffs = {
+    p10: at(0.1),
+    p25: at(0.25),
+    p50: at(0.5),
+    p75: at(0.75),
+    p90: at(0.9),
+    p95: at(0.95),
+    p99: at(0.99),
+  };
+  const { mean, standardDeviation: stdDev, min, max } = summary;
+  return { ...cutoffs, mean, stdDev, min, max };
+}
+
+// Percentage (0-100) of `data` entries that are <= `value`; 0 for empty `data`.
+function calculateUserPercentile(value: number, data: number[]): number {
+  if (data.length === 0) return 0;
+  // Counting does not depend on order, so no sorted copy is needed.
+  const atOrBelow = data.reduce((n, x) => (x <= value ? n + 1 : n), 0);
+  return (atOrBelow / data.length) * 100;
+}
+
+/**
+ * Derives streak and consistency figures from one user's daily activity.
+ *
+ * @param dailyActivity - Daily entries in any order; the array is left
+ *   untouched. Streaks count adjacent entries after sorting by `date`, so
+ *   callers should supply one entry per day (zero-filled) for true day runs.
+ * @returns Longest and trailing runs of active entries, active/total entry
+ *   counts, and consistencyScore = activeDays / totalDays * 100 (0 if empty).
+ */
+function calculateStreakData(
+  dailyActivity: Array<{ date: string; messageCount: number }>,
+): UserCohortMetrics["streakData"] {
+  // Sort a copy: Array.prototype.sort reorders in place and would otherwise
+  // mutate the caller's array.
+  const days = [...dailyActivity].sort((a, b) => a.date.localeCompare(b.date));
+
+  let longestStreak = 0;
+  let activeDays = 0;
+  // Length of the active run ending at the entry being visited. After the
+  // loop it is the run ending at the final entry, i.e. the current streak.
+  let run = 0;
+
+  for (const { messageCount } of days) {
+    if (messageCount > 0) {
+      activeDays++;
+      run++;
+      longestStreak = Math.max(longestStreak, run);
+    } else {
+      run = 0;
+    }
+  }
+
+  // An empty input leaves every figure at 0.
+  const totalDays = days.length;
+  const consistencyScore = totalDays > 0 ? (activeDays / totalDays) * 100 : 0;
+
+  return {
+    longestStreak,
+    currentStreak: run,
+    consistencyScore,
+    activeDays,
+    totalDays,
+  };
+}
+
+/**
+ * Merges per-message code statistics into totals and a per-language breakdown.
+ *
+ * @param codeStatsJson - JSON texts, each expected to encode an array of
+ *   CodeStats entries. Texts that fail to parse or that decode to a non-array
+ *   are ignored; non-numeric `chars`/`lines` count as 0 instead of yielding NaN.
+ * @returns Total chars and lines, chars per language, and the five languages
+ *   with the most chars, each with its percentage share of totalChars.
+ */
+function aggregateCodeStats(
+  codeStatsJson: string[],
+): UserCohortMetrics["codeStats"] {
+  const count = (n: unknown) => (typeof n === "number" && isFinite(n) ? n : 0);
+  const languageBreakdown: Record<string, number> = {};
+  let totalChars = 0;
+  let totalLines = 0;
+  for (const jsonStr of codeStatsJson) {
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(jsonStr);
+    } catch {
+      continue; // best effort: malformed text contributes nothing
+    }
+    if (!Array.isArray(parsed)) continue;
+    for (const stat of parsed as Array<Partial<CodeStats> | null>) {
+      if (stat === null || typeof stat !== "object") continue;
+      totalChars += count(stat.chars);
+      totalLines += count(stat.lines);
+      if (!stat.lang) continue; // untagged code counts toward the totals only
+      const soFar = languageBreakdown[stat.lang] ?? 0;
+      languageBreakdown[stat.lang] = soFar + count(stat.chars);
+    }
+  }
+
+  const topLanguages = Object.entries(languageBreakdown)
+    .map(([language, chars]) => ({
+      language,
+      chars,
+      percentage: totalChars > 0 ? (chars / totalChars) * 100 : 0,
+    }))
+    .sort((a, b) => b.chars - a.chars)
+    .slice(0, 5);
+
+  return { totalChars, totalLines, languageBreakdown, topLanguages };
+}
+
+// Splits comma-joined JSON texts (SQLite group_concat output) into one JSON
+// text per value. A plain split(",") would also cut at the commas inside each
+// value, so the run is parsed as a single bracketed array instead.
+function splitConcatenatedJson(concatenated: string): string[] {
+  try {
+    const values = JSON.parse(`[${concatenated}]`) as unknown[];
+    return values.map((value) => JSON.stringify(value));
+  } catch {
+    return concatenated.split(",").filter(Boolean); // not clean JSON: naive split
+  }
+}
+
+/**
+ * Loads metrics for every user with >= `minMessageThreshold` messages in the
+ * range (`start`..`end`) for the guild; resolves to [] when nobody qualifies.
+ */
+export async function getCohortMetrics(
+  guildId: string,
+  start: string,
+  end: string,
+  minMessageThreshold: number = 10,
+): Promise<UserCohortMetrics[]> {
+  // Per-user totals; HAVING drops users below the message threshold.
+  const userStats = await createMessageStatsQuery(guildId, start, end)
+    .select((eb) => [
+      "author_id",
+      eb.fn.count<number>("author_id").as("message_count"),
+      eb.fn.sum<number>("word_count").as("word_count"),
+      eb.fn.sum<number>("react_count").as("reaction_count"),
+      eb.fn("group_concat", ["code_stats"]).as("code_stats_json"),
+    ])
+    .groupBy("author_id")
+    .having((eb) =>
+      eb(eb.fn.count<number>("author_id"), ">=", minMessageThreshold),
+    )
+    .execute();
+  if (userStats.length === 0) return []; // no cohort: skip the IN-list query
+
+  // Per-user, per-day message counts for the streak calculation.
+  const cohortIds = userStats.map((u) => u.author_id);
+  const dailyActivity = await createMessageStatsQuery(guildId, start, end)
+    .select(({ fn, eb, lit }) => [
+      "author_id",
+      fn.count<number>("author_id").as("message_count"),
+      eb
+        .fn("date", [eb("sent_at", "/", lit(1000)), sql.lit("unixepoch")])
+        .as("date"),
+    ])
+    .groupBy(["author_id", "date"])
+    .where("author_id", "in", cohortIds)
+    .execute();
+
+  const dailyActivityByUser: Record<
+    string,
+    Array<{ date: string; messageCount: number }>
+  > = {};
+  for (const { author_id, date, message_count } of dailyActivity) {
+    const days = dailyActivityByUser[author_id] ?? [];
+    days.push({ date: date as string, messageCount: message_count });
+    dailyActivityByUser[author_id] = days;
+  }
+
+  return userStats.map((user) => {
+    const days = dailyActivityByUser[user.author_id] || [];
+    const codeStatsJson = user.code_stats_json
+      ? splitConcatenatedJson(String(user.code_stats_json))
+      : [];
+    return {
+      userId: user.author_id,
+      messageCount: user.message_count,
+      wordCount: user.word_count || 0,
+      reactionCount: user.reaction_count || 0,
+      codeStats: aggregateCodeStats(codeStatsJson),
+      streakData: calculateStreakData(
+        fillDateGaps(days, start, end, { messageCount: 0 }),
+      ),
+    };
+  });
+}
+
+/**
+ * Computes cohort-wide percentile benchmarks for every tracked metric.
+ *
+ * @param cohortMetrics - Metrics for each cohort member; may be empty.
+ * @returns Benchmarks per metric plus, under `languageDistribution`, one set
+ *   of benchmarks for each language found in any member's breakdown. An empty
+ *   cohort yields all-zero benchmarks and an empty language map.
+ */
+export function calculateCohortBenchmarks(
+  cohortMetrics: UserCohortMetrics[],
+): CohortBenchmarks {
+  if (cohortMetrics.length === 0) {
+    // Fresh zeroed object per field, so a later change to one cannot leak
+    // into the others through a shared reference.
+    const zeros = (): PercentileBenchmarks => calculatePercentileBenchmarks([]);
+    return {
+      messageCount: zeros(),
+      wordCount: zeros(),
+      reactionCount: zeros(),
+      codeChars: zeros(),
+      codeLines: zeros(),
+      longestStreak: zeros(),
+      consistencyScore: zeros(),
+      languageDistribution: {},
+    };
+  }
+
+  // Extract arrays for each metric
+  const messageCounts = cohortMetrics.map((u) => u.messageCount);
+  const wordCounts = cohortMetrics.map((u) => u.wordCount);
+  const reactionCounts = cohortMetrics.map((u) => u.reactionCount);
+  const codeChars = cohortMetrics.map((u) => u.codeStats.totalChars);
+  const codeLines = cohortMetrics.map((u) => u.codeStats.totalLines);
+  const longestStreaks = cohortMetrics.map((u) => u.streakData.longestStreak);
+  const consistencyScores = cohortMetrics.map(
+    (u) => u.streakData.consistencyScore,
+  );
+
+  // Calculate language distribution benchmarks
+  const allLanguages = new Set(
+    cohortMetrics.flatMap((user) =>
+      Object.keys(user.codeStats.languageBreakdown),
+    ),
+  );
+
+  const languageDistribution = Array.from(allLanguages).reduce(
+    (acc, language) => {
+      // Members without this language count as 0 (as in compareUserToCohort);
+      // a missing entry would otherwise put `undefined` into the sample.
+      acc[language] = calculatePercentileBenchmarks(
+        cohortMetrics.map((u) => u.codeStats.languageBreakdown[language] ?? 0),
+      );
+      return acc;
+    },
+    {} as Record<string, PercentileBenchmarks>,
+  );
+
+  return {
+    messageCount: calculatePercentileBenchmarks(messageCounts),
+    wordCount: calculatePercentileBenchmarks(wordCounts),
+    reactionCount: calculatePercentileBenchmarks(reactionCounts),
+    codeChars: calculatePercentileBenchmarks(codeChars),
+    codeLines: calculatePercentileBenchmarks(codeLines),
+    longestStreak: calculatePercentileBenchmarks(longestStreaks),
+    consistencyScore: calculatePercentileBenchmarks(consistencyScores),
+    languageDistribution,
+  };
+}
+
+/**
+ * Positions one user's metrics within a cohort.
+ *
+ * @param userMetrics - Metrics of the user being evaluated.
+ * @param cohortMetrics - Metrics of every member of the comparison cohort.
+ * @returns The user's metrics together with
+ *   - `percentiles`: for each metric (and each of the user's top languages),
+ *     the share (0-100) of cohort values at or below the user's value;
+ *   - `rankings`: 1 + the number of members with a strictly greater value,
+ *     alongside the cohort size;
+ *   - `cohortInsights`: an overall tier and strength / improvement labels.
+ *   With an empty cohort every percentile is 0 and every rank is 1 of 0.
+ */
+export function compareUserToCohort(
+  userMetrics: UserCohortMetrics,
+  cohortMetrics: UserCohortMetrics[],
+): UserCohortComparison {
+  // The user's own value for every compared metric.
+  const own = {
+    messageCount: userMetrics.messageCount,
+    wordCount: userMetrics.wordCount,
+    reactionCount: userMetrics.reactionCount,
+    codeChars: userMetrics.codeStats.totalChars,
+    codeLines: userMetrics.codeStats.totalLines,
+    longestStreak: userMetrics.streakData.longestStreak,
+    consistencyScore: userMetrics.streakData.consistencyScore,
+  };
+
+  type Metric = keyof typeof own;
+
+  // The cohort's values for the same metrics, one array per metric.
+  const cohort: Record<Metric, number[]> = {
+    messageCount: cohortMetrics.map((member) => member.messageCount),
+    wordCount: cohortMetrics.map((member) => member.wordCount),
+    reactionCount: cohortMetrics.map((member) => member.reactionCount),
+    codeChars: cohortMetrics.map((member) => member.codeStats.totalChars),
+    codeLines: cohortMetrics.map((member) => member.codeStats.totalLines),
+    longestStreak: cohortMetrics.map(
+      (member) => member.streakData.longestStreak,
+    ),
+    consistencyScore: cohortMetrics.map(
+      (member) => member.streakData.consistencyScore,
+    ),
+  };
+
+  // Percentile of the user's value within the cohort for one metric.
+  const percentileFor = (metric: Metric): number =>
+    calculateUserPercentile(own[metric], cohort[metric]);
+
+  // Rank among the cohort: members with a strictly greater value come first,
+  // so ties share the better rank.
+  const rankFor = (metric: Metric) => {
+    const ahead = cohort[metric].filter((value) => value > own[metric]);
+    return { rank: ahead.length + 1, total: cohort[metric].length };
+  };
+
+  // Per-language percentiles for the user's top languages only; a member
+  // without an entry for the language is compared as 0 chars.
+  const topLanguagePercentiles: Record<string, number> = {};
+  for (const { language } of userMetrics.codeStats.topLanguages) {
+    const charsOf = (member: UserCohortMetrics): number =>
+      member.codeStats.languageBreakdown[language] || 0;
+    topLanguagePercentiles[language] = calculateUserPercentile(
+      charsOf(userMetrics),
+      cohortMetrics.map(charsOf),
+    );
+  }
+
+  // Key order matches the UserCohortComparison["percentiles"] declaration.
+  const percentiles = {
+    messageCount: percentileFor("messageCount"),
+    wordCount: percentileFor("wordCount"),
+    reactionCount: percentileFor("reactionCount"),
+    codeChars: percentileFor("codeChars"),
+    codeLines: percentileFor("codeLines"),
+    longestStreak: percentileFor("longestStreak"),
+    consistencyScore: percentileFor("consistencyScore"),
+    topLanguagePercentiles,
+  };
+
+  // Only these five metrics carry a rank.
+  const rankings = {
+    messageCount: rankFor("messageCount"),
+    wordCount: rankFor("wordCount"),
+    reactionCount: rankFor("reactionCount"),
+    codeChars: rankFor("codeChars"),
+    longestStreak: rankFor("longestStreak"),
+  };
+
+  // Overall tier: mean of four headline percentiles, matched against the
+  // thresholds table (first tier whose minimum is met).
+  const headline = [
+    percentiles.messageCount,
+    percentiles.wordCount,
+    percentiles.reactionCount,
+    percentiles.longestStreak,
+  ];
+  const avgPercentile = headline.reduce((sum, p) => sum + p, 0) / 4;
+  // The -Infinity tier matches any non-NaN average, hence the assertion.
+  const overallPerformance = performanceThresholds.find(
+    ({ min }) => avgPercentile >= min,
+  )!.value;
+
+  // Metrics with a percentile of 50 or more become strengths; the remaining
+  // ones are reported as improvement areas.
+  const [strong, weak] = partition(
+    metricsConfig,
+    ({ key }) => percentiles[key] >= 50,
+  );
+  const strengths = strong.map(({ strength }) => strength);
+  const improvementAreas = weak.map(({ improvement }) => improvement);
+
+  return {
+    user: userMetrics,
+    percentiles,
+    rankings,
+    cohortInsights: {
+      overallPerformance,
+      strengths,
+      improvementAreas,
+    },
+  };
+}
+
+// Builds the cohort for the range and compares `userId` against it; resolves
+// to null when that user is not among the cohort members returned.
+export async function getUserCohortAnalysis(
+  guildId: string,
+  userId: string,
+  start: string,
+  end: string,
+  minMessageThreshold: number = 10,
+) {
+  const threshold = minMessageThreshold;
+  const cohort = await getCohortMetrics(guildId, start, end, threshold);
+  for (const member of cohort) {
+    // First match wins, mirroring Array.prototype.find.
+    if (member.userId === userId) return compareUserToCohort(member, cohort);
+  }
+  return null;
+}
diff --git a/app/models/activity.server.ts b/app/models/activity.server.ts
index 29f5aec..a78d8d0 100644
--- a/app/models/activity.server.ts
+++ b/app/models/activity.server.ts
@@ -3,6 +3,7 @@ import db from "#~/db.server";
 import { getOrFetchUser } from "#~/helpers/userInfoCache.js";
 import { fillDateGaps } from "#~/helpers/dateUtils";
 import { sql } from "kysely";
+import { getUserCohortAnalysis } from "#~/helpers/cohortAnalysis";
 
 type MessageStats = DB["message_stats"];
 
@@ -134,6 +135,23 @@ export async function getUserMessageAnalytics(
   return { dailyBreakdown, categoryBreakdown, channelBreakdown, userInfo };
 }
 
+// Combines the user's message analytics with their cohort comparison; the two
+// lookups are independent, so they are started together and awaited as a pair.
+export async function getEnhancedUserAnalytics(
+  guildId: string,
+  userId: string,
+  start: string,
+  end: string,
+) {
+  const pendingBasics = getUserMessageAnalytics(guildId, userId, start, end);
+  const pendingCohort = getUserCohortAnalysis(guildId, userId, start, end);
+  const [basicAnalytics, cohortComparison] = await Promise.all([
+    pendingBasics,
+    pendingCohort,
+  ]);
+  return { ...basicAnalytics, cohortComparison };
+}
+
 export async function getTopParticipants(
   guildId: MessageStats["guild_id"],
   intervalStart: string,
diff --git a/app/routes/__auth/dashboard.tsx b/app/routes/__auth/dashboard.tsx
index 4c43508..46c52c1 100644
--- a/app/routes/__auth/dashboard.tsx
+++ b/app/routes/__auth/dashboard.tsx
@@ -2,6 +2,10 @@ import type { Route } from "./+types/dashboard";
 import { data, useSearchParams, Link } from "react-router";
 import type { LabelHTMLAttributes, PropsWithChildren } from "react";
 import { getTopParticipants } from "#~/models/activity.server";
+import {
+  getCohortMetrics,
+  calculateCohortBenchmarks,
+} from "#~/helpers/cohortAnalysis";
 
 export async function loader({ params, request }: Route.LoaderArgs) {
   // const user = await getUser(request);
@@ -9,14 +13,23 @@ export async function loader({ params, request }: Route.LoaderArgs) {
   const start = url.searchParams.get("start");
   const end = url.searchParams.get("end");
   const guildId = params.guildId;
+  const minThreshold = Number(url.searchParams.get("minThreshold") || 10);
 
   if (!(guildId && start && end)) {
     return data(null, { status: 400 });
   }
 
-  const output = await getTopParticipants(guildId, start, end);
+  const userResults = await getTopParticipants(guildId, start, end);
 
-  return output;
+  // Return full cohort metrics and benchmarks
+  const cohortMetrics = await getCohortMetrics(
+    guildId,
+    start,
+    end,
+    minThreshold,
+  );
+  const benchmarks = calculateCohortBenchmarks(cohortMetrics);
+  return { cohortMetrics, benchmarks, userResults };
 }
 
 const Label = (props: LabelHTMLAttributes<Element>) => (
@@ -54,15 +67,13 @@ const DataHeading = ({ children }: PropsWithChildren) => {
   );
 };
 
-export default function DashboardPage({
-  loaderData: data,
-}: Route.ComponentProps) {
+export default function DashboardPage({ loaderData }: Route.ComponentProps) {
   const [qs] = useSearchParams();
 
   const start = qs.get("start") ?? undefined;
   const end = qs.get("end") ?? undefined;
 
-  if (!data) {
+  if (!loaderData) {
     return (
       <div className="h-full px-6 py-8">
         <div className="flex justify-center">
@@ -73,15 +84,25 @@ export default function DashboardPage({
     );
   }
 
+  const { userResults, cohortMetrics, benchmarks } = loaderData;
+
   return (
     <div className="px-6 py-8">
       <div className="flex justify-center">
         <RangeForm values={{ start, end }} />
       </div>
       <div>
+        <textarea readOnly className="resize text-black">
+          {JSON.stringify({ benchmarks }, null, 2)}
+        </textarea>
+        <textarea readOnly className="resize text-black">
+          {JSON.stringify({ cohortMetrics }, null, 2)}
+        </textarea>
+
         <textarea
+          className="resize text-black"
           defaultValue={`Author ID,Percent Zero Days,Word Count,Message Count,Channel Count,Category Count,Reaction Count,Word Score,Message Score,Channel Score,Consistency Score
-${data
+${userResults
   .map(
     (d) =>
       `${d.data.member.author_id},${d.metadata.percentZeroDays},${d.data.member.total_word_count},${d.data.member.message_count},${d.data.member.channel_count},${d.data.member.category_count},${d.data.member.total_reaction_count},${d.score.wordScore},${d.score.messageScore},${d.score.channelScore},${d.score.consistencyScore}`,
@@ -105,7 +126,7 @@ ${data
             </tr>
           </thead>
           <tbody>
-            {data.map((d) => (
+            {userResults.map((d) => (
               <tr key={d.data.member.author_id}>
                 <td>
                   <Link
diff --git a/app/routes/__auth/sh-user.tsx b/app/routes/__auth/sh-user.tsx
index df9870b..8780553 100644
--- a/app/routes/__auth/sh-user.tsx
+++ b/app/routes/__auth/sh-user.tsx
@@ -20,6 +20,7 @@ import {
 } from "recharts";
 import { useMemo } from "react";
 import { getUserMessageAnalytics } from "#~/models/activity.server";
+import { getUserCohortAnalysis } from "#~/helpers/cohortAnalysis.js";
 
 export async function loader({ request, params }: LoaderFunctionArgs) {
   const { guildId, userId } = params;
@@ -30,13 +31,21 @@ export async function loader({ request, params }: LoaderFunctionArgs) {
   const url = new URL(request.url);
   const start = url.searchParams.get("start");
   const end = url.searchParams.get("end");
+  const minThreshold = Number(url.searchParams.get("minThreshold") || 10);
 
   if (!start || !end) {
     throw new Error("cannot load data without start and end range");
   }
 
-  // Use shared analytics function with channel filtering disabled for user view
-  return await getUserMessageAnalytics(guildId, userId, start, end);
+  const [analysis, data] = await Promise.all([
+    getUserCohortAnalysis(guildId, userId, start, end, minThreshold),
+    getUserMessageAnalytics(guildId, userId, start, end),
+  ]);
+
+  return {
+    analysis,
+    data,
+  };
 }
 
 const num = new Intl.NumberFormat("en-US", {
@@ -46,7 +55,7 @@ const num = new Intl.NumberFormat("en-US", {
 
 export default function UserProfile({
   params,
-  loaderData: data,
+  loaderData: { data, analysis },
 }: Route.ComponentProps) {
   const [qs] = useSearchParams();
   const start = qs.get("start");
@@ -128,6 +137,7 @@ text {
           </p>
           <p></p>
           <p>Received {num.format(derivedData.totalReactions)} reactions.</p>
+          <pre>{JSON.stringify(analysis, null, 2)}</pre>
         </div>
         <div className="mx-auto max-w-screen-lg">
           <ResponsiveContainer width="100%" height={200}>