From 6a3818af6676273f34282b4d27a7effe59cce529 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Sun, 6 Feb 2022 21:18:29 -0800 Subject: [PATCH 01/11] WIP: Add some basic analysis to index. Working towards an interactive dashboard. --- site/index.md | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/site/index.md b/site/index.md index 3e5b773..e7867bf 100644 --- a/site/index.md +++ b/site/index.md @@ -1,3 +1,16 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.6 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + Scientific Python Devstats ========================== @@ -10,3 +23,87 @@ ecosystem. project_reports ``` + +% TODO: automate project generation based on which data files are in devstats-data + +```{code-cell} ipython3 +--- +tags: [remove-cell] +--- +import json +import datetime +import itertools +from dateutil.parser import isoparse +import numpy as np +import matplotlib.pyplot as plt + +projects = [ + "numpy", "scipy", "matplotlib", "pandas", "scikit-learn", "scikit-image", "networkx" +] + +project_prs = dict() +for proj in projects: + with open(f"../devstats-data/{proj}_prs.json") as fh: + data = [item["node"] for item in json.loads(fh.read())] + + # Only consider prs to the main development branch + default_branches = {"main", "master"} + prs = [pr for pr in data if pr["baseRefName"] in default_branches] + + # Ignore PRs with unknown author + prs = [pr for pr in prs if pr["author"]] # Failed author query results in None + + # Ignore bots + bot_filter = {"dependabot-preview"} + prs = [pr for pr in prs if pr["author"]["login"] not in bot_filter] + + # Split into merged and open + merged_prs = [pr for pr in prs if pr["state"] == "MERGED"] + open_prs = [pr for pr in prs if pr["state"] == "OPEN"] + + # Only look at PRs that have been created or merged in the last year + today = np.datetime64(datetime.datetime.now(), "D") + year = np.timedelta64(365, "D") + merged_prs = [ + pr for pr in merged_prs + if (today - np.datetime64(pr["mergedAt"], "D")) < year + ] + open_prs = [ + pr for pr in open_prs + if (today - np.datetime64(pr["createdAt"], "D")) < year + ] + + project_prs[proj] = { + "open_prs" : open_prs, + "merged_prs" : merged_prs, + } +``` + +```{code-cell} ipython3 +--- +tags: [remove-input] +--- +# Num merged PRs per month +start_date = today - year +bedges = np.array( + [start_date + i * np.timedelta64(30, "D") for i in range(13)], dtype=np.datetime64 +) +# Proxy date for center of bin +x = bedges[:-1] + np.timedelta64(15, "D") + +fig, ax = plt.subplots(figsize=(16, 12)) +ax.set_title("Merged PRs", fontsize=24) + +# NOTE: np.histogram doesn't work on datetimes +for proj, data in project_prs.items(): + merged_prs = data["merged_prs"] + merge_dates = np.array([pr["mergedAt"] for pr in merged_prs], dtype="M8[D]") + num_merged_per_month = [] + for lo, hi in itertools.pairwise(bedges): + num_merged_per_month.append( + sum(1 for md in merge_dates if md > lo and md < hi) + ) + ax.plot(x, num_merged_per_month, label=proj) +ax.legend() +plt.show() +``` From 9d63096209f7f19a3279a30d87ff3a03e3c74a4a Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Sun, 6 Feb 2022 21:46:23 -0800 Subject: [PATCH 02/11] WIP: Use bokeh instead of matplotlib. --- site/index.md | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/site/index.md b/site/index.md index e7867bf..85fa034 100644 --- a/site/index.md +++ b/site/index.md @@ -24,6 +24,18 @@ ecosystem. project_reports ``` +% TODO: Figure out why bokeh won't render when output_notebook is in a hidden cell + +```{code-cell} ipython3 +--- +tags: [] +--- +# For interactive plots +from bokeh.plotting import figure, show, output_notebook +from bokeh.palettes import Category10_10 as palette +output_notebook() +``` + % TODO: automate project generation based on which data files are in devstats-data ```{code-cell} ipython3 @@ -81,7 +93,7 @@ for proj in projects: ```{code-cell} ipython3 --- -tags: [remove-input] +tags: [remove-cell] --- # Num merged PRs per month start_date = today - year @@ -91,10 +103,8 @@ bedges = np.array( # Proxy date for center of bin x = bedges[:-1] + np.timedelta64(15, "D") -fig, ax = plt.subplots(figsize=(16, 12)) -ax.set_title("Merged PRs", fontsize=24) - # NOTE: np.histogram doesn't work on datetimes +merged_prs_per_month = dict() for proj, data in project_prs.items(): merged_prs = data["merged_prs"] merge_dates = np.array([pr["mergedAt"] for pr in merged_prs], dtype="M8[D]") @@ -103,7 +113,20 @@ for proj, data in project_prs.items(): num_merged_per_month.append( sum(1 for md in merge_dates if md > lo and md < hi) ) - ax.plot(x, num_merged_per_month, label=proj) -ax.legend() -plt.show() + merged_prs_per_month[proj] = num_merged_per_month +``` + +```{code-cell} ipython3 +--- +tags: [remove-input] +--- +p = figure( + width=400, + height=400, + title="Merged PRs per month", + x_axis_type="datetime", +) +for (label, y), color in zip(merged_prs_per_month.items(), itertools.cycle(palette)): + p.line(x, y, color=color, legend_label=label) +show(p) ``` From f45926beb5cd3183ba82bf0d4b76ebde26841cfd Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Sun, 6 Feb 2022 21:56:37 -0800 Subject: [PATCH 03/11] WIP: Clean up legend a bit. --- site/index.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/site/index.md b/site/index.md index 85fa034..793ce10 100644 --- a/site/index.md +++ b/site/index.md @@ -33,6 +33,7 @@ tags: [] # For interactive plots from bokeh.plotting import figure, show, output_notebook from bokeh.palettes import Category10_10 as palette +from bokeh.models import Legend output_notebook() ``` @@ -121,12 +122,18 @@ for proj, data in project_prs.items(): tags: [remove-input] --- p = figure( - width=400, + width=600, height=400, title="Merged PRs per month", x_axis_type="datetime", ) + +legend_items = [] for (label, y), color in zip(merged_prs_per_month.items(), itertools.cycle(palette)): - p.line(x, y, color=color, legend_label=label) + l = p.line(x, y, color=color) + legend_items.append((label, [l])) + +legend = Legend(items=legend_items, orientation="horizontal") +p.add_layout(legend, "below") show(p) ``` From 8dd2d17e77547df7816e8497ec961542626363d1 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Sun, 6 Feb 2022 22:03:48 -0800 Subject: [PATCH 04/11] Toggle visibility of plots via clickable legend. --- site/index.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/site/index.md b/site/index.md index 793ce10..948ea57 100644 --- a/site/index.md +++ b/site/index.md @@ -122,7 +122,7 @@ for proj, data in project_prs.items(): tags: [remove-input] --- p = figure( - width=600, + width=650, height=400, title="Merged PRs per month", x_axis_type="datetime", @@ -130,10 +130,11 @@ p = figure( legend_items = [] for (label, y), color in zip(merged_prs_per_month.items(), itertools.cycle(palette)): - l = p.line(x, y, color=color) + l = p.line(x, y, line_width=2, color=color, muted_alpha=0.2) legend_items.append((label, [l])) legend = Legend(items=legend_items, orientation="horizontal") +legend.click_policy = "mute" p.add_layout(legend, "below") show(p) ``` From 31e905da71f07906f7eaad4a524930276d6ac585 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Sun, 6 Feb 2022 21:18:29 -0800 Subject: [PATCH 05/11] WIP: Add some basic analysis to index. Working towards an interactive dashboard. --- site/index.md | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/site/index.md b/site/index.md index 948ea57..fd86693 100644 --- a/site/index.md +++ b/site/index.md @@ -138,3 +138,87 @@ legend.click_policy = "mute" p.add_layout(legend, "below") show(p) ``` + +% TODO: automate project generation based on which data files are in devstats-data + +```{code-cell} ipython3 +--- +tags: [remove-cell] +--- +import json +import datetime +import itertools +from dateutil.parser import isoparse +import numpy as np +import matplotlib.pyplot as plt + +projects = [ + "numpy", "scipy", "matplotlib", "pandas", "scikit-learn", "scikit-image", "networkx" +] + +project_prs = dict() +for proj in projects: + with open(f"../devstats-data/{proj}_prs.json") as fh: + data = [item["node"] for item in json.loads(fh.read())] + + # Only consider prs to the main development branch + default_branches = {"main", "master"} + prs = [pr for pr in data if pr["baseRefName"] in default_branches] + + # Ignore PRs with unknown author + prs = [pr for pr in prs if pr["author"]] # Failed author query results in None + + # Ignore bots + bot_filter = {"dependabot-preview"} + prs = [pr for pr in prs if pr["author"]["login"] not in bot_filter] + + # Split into merged and open + merged_prs = [pr for pr in prs if pr["state"] == "MERGED"] + open_prs = [pr for pr in prs if pr["state"] == "OPEN"] + + # Only look at PRs that have been created or merged in the last year + today = np.datetime64(datetime.datetime.now(), "D") + year = np.timedelta64(365, "D") + merged_prs = [ + pr for pr in merged_prs + if (today - np.datetime64(pr["mergedAt"], "D")) < year + ] + open_prs = [ + pr for pr in open_prs + if (today - np.datetime64(pr["createdAt"], "D")) < year + ] + + project_prs[proj] = { + "open_prs" : open_prs, + "merged_prs" : merged_prs, + } +``` + +```{code-cell} ipython3 +--- +tags: [remove-input] +--- +# Num merged PRs per month +start_date = today - year +bedges = np.array( + [start_date + i * np.timedelta64(30, "D") for i in range(13)], dtype=np.datetime64 +) +# Proxy date for center of bin +x = bedges[:-1] + np.timedelta64(15, "D") + +fig, ax = plt.subplots(figsize=(16, 12)) +ax.set_title("Merged PRs", fontsize=24) + +# NOTE: np.histogram doesn't work on datetimes +for proj, data in project_prs.items(): + merged_prs = data["merged_prs"] + merge_dates = np.array([pr["mergedAt"] for pr in merged_prs], dtype="M8[D]") + num_merged_per_month = [] + for lo, hi in itertools.pairwise(bedges): + num_merged_per_month.append( + sum(1 for md in merge_dates if md > lo and md < hi) + ) + ax.plot(x, num_merged_per_month, label=proj) +ax.legend() +plt.show() +``` From 1df7f94498943fa99ec8f065351b968ccd9b50aa Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Sun, 6 Feb 2022 21:46:23 -0800 Subject: [PATCH 06/11] WIP: Use bokeh instead of matplotlib. --- site/index.md | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/site/index.md b/site/index.md index fd86693..bfe8759 100644 --- a/site/index.md +++ b/site/index.md @@ -139,6 +139,18 @@ p.add_layout(legend, "below") show(p) ``` +% TODO: Figure out why bokeh won't render when output_notebook is in a hidden cell + +```{code-cell} ipython3 +--- +tags: [] +--- +# For interactive plots +from bokeh.plotting import figure, show, output_notebook +from bokeh.palettes import Category10_10 as palette +output_notebook() +``` + % TODO: automate project generation based on which data files are in devstats-data ```{code-cell} ipython3 @@ -196,7 +208,7 @@ for proj in projects: ```{code-cell} ipython3 --- -tags: [remove-input] +tags: [remove-cell] --- # Num merged PRs per month start_date = today - year @@ -206,10 +218,8 @@ bedges = np.array( # Proxy date for center of bin x = bedges[:-1] + np.timedelta64(15, "D") -fig, ax = plt.subplots(figsize=(16, 12)) -ax.set_title("Merged PRs", fontsize=24) - # NOTE: np.histogram doesn't work on datetimes +merged_prs_per_month = dict() for proj, data in project_prs.items(): merged_prs = data["merged_prs"] merge_dates = np.array([pr["mergedAt"] for pr in merged_prs], dtype="M8[D]") @@ -218,7 +228,20 @@ for proj, data in project_prs.items(): num_merged_per_month.append( sum(1 for md in merge_dates if md > lo and md < hi) ) - ax.plot(x, num_merged_per_month, label=proj) -ax.legend() -plt.show() + merged_prs_per_month[proj] = num_merged_per_month +``` + +```{code-cell} ipython3 +--- +tags: [remove-input] +--- +p = figure( + width=400, + height=400, + title="Merged PRs per month", + x_axis_type="datetime", +) +for (label, y), color in zip(merged_prs_per_month.items(), itertools.cycle(palette)): + p.line(x, y, color=color, legend_label=label) +show(p) ``` From 391122db823993ffa8c368bb13f0e482c33b7667 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 7 Feb 2022 13:51:02 -0800 Subject: [PATCH 07/11] Update merged_pr analysis with array semantics. --- site/index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/site/index.md b/site/index.md index bfe8759..6bd480d 100644 --- a/site/index.md +++ b/site/index.md @@ -107,13 +107,13 @@ x = bedges[:-1] + np.timedelta64(15, "D") # NOTE: np.histogram doesn't work on datetimes merged_prs_per_month = dict() for proj, data in project_prs.items(): - merged_prs = data["merged_prs"] + # Num merged PRs per month + merged_prs = np.array(data["merged_prs"], dtype=object) merge_dates = np.array([pr["mergedAt"] for pr in merged_prs], dtype="M8[D]") num_merged_per_month = [] for lo, hi in itertools.pairwise(bedges): - num_merged_per_month.append( - sum(1 for md in merge_dates if md > lo and md < hi) - ) + month_mask = (merge_dates < hi) & (merge_dates > lo) + num_merged_per_month.append(month_mask.sum()) merged_prs_per_month[proj] = num_merged_per_month ``` From ec51482eb70438d184d2cae671b100bc7d2ae138 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 7 Feb 2022 13:59:23 -0800 Subject: [PATCH 08/11] Add analysis of number of uniq mergers per month. --- site/index.md | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/site/index.md b/site/index.md index 6bd480d..0b5e2f2 100644 --- a/site/index.md +++ b/site/index.md @@ -106,15 +106,25 @@ x = bedges[:-1] + np.timedelta64(15, "D") # NOTE: np.histogram doesn't work on datetimes merged_prs_per_month = dict() +uniq_mergers_per_month = dict() for proj, data in project_prs.items(): # Num merged PRs per month merged_prs = np.array(data["merged_prs"], dtype=object) merge_dates = np.array([pr["mergedAt"] for pr in merged_prs], dtype="M8[D]") num_merged_per_month = [] + uniq_mergers = [] for lo, hi in itertools.pairwise(bedges): month_mask = (merge_dates < hi) & (merge_dates > lo) + + # Number of PRs merged per month num_merged_per_month.append(month_mask.sum()) + + # Number of unique maintainers who merged a PR in a given month + mergers = {pr["mergedBy"]["login"] for pr in merged_prs[month_mask]} + uniq_mergers.append(len(mergers)) + merged_prs_per_month[proj] = num_merged_per_month + uniq_mergers_per_month[proj] = uniq_mergers ``` ```{code-cell} ipython3 @@ -236,12 +246,19 @@ for proj, data in project_prs.items(): tags: [remove-input] --- p = figure( - width=400, + width=650, height=400, - title="Merged PRs per month", + title="Number of unique maintainers who merged at least 1 PR", x_axis_type="datetime", ) -for (label, y), color in zip(merged_prs_per_month.items(), itertools.cycle(palette)): - p.line(x, y, color=color, legend_label=label) + +legend_items = [] +for (label, y), color in zip(uniq_mergers_per_month.items(), itertools.cycle(palette)): + l = p.line(x, y, line_width=2, color=color, muted_alpha=0.2) + legend_items.append((label, [l])) + +legend = Legend(items=legend_items, orientation="horizontal") +legend.click_policy = "mute" +p.add_layout(legend, "below") show(p) ``` From a24ab008e7e4de3a705322074d595560b1e37847 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Tue, 8 Feb 2022 20:36:12 -0800 Subject: [PATCH 09/11] Add avg num PRs merged per maintainer. --- site/index.md | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/site/index.md b/site/index.md index 0b5e2f2..f03e356 100644 --- a/site/index.md +++ b/site/index.md @@ -123,8 +123,8 @@ for proj, data in project_prs.items(): mergers = {pr["mergedBy"]["login"] for pr in merged_prs[month_mask]} uniq_mergers.append(len(mergers)) - merged_prs_per_month[proj] = num_merged_per_month - uniq_mergers_per_month[proj] = uniq_mergers + merged_prs_per_month[proj] = np.array(num_merged_per_month) + uniq_mergers_per_month[proj] = np.array(uniq_mergers) ``` ```{code-cell} ipython3 @@ -262,3 +262,25 @@ legend.click_policy = "mute" p.add_layout(legend, "below") show(p) ``` + +```{code-cell} ipython3 +--- +tags: [remove-input] +--- +p = figure( + width=650, + height=400, + title="Avg # PRs merged per maintainer", + x_axis_type="datetime", +) + +legend_items = [] +for (label, y), (_, n), color in zip(merged_prs_per_month.items(), uniq_mergers_per_month.items(), itertools.cycle(palette)): + l = p.line(x, y / n, line_width=2, color=color, muted_alpha=0.2) + legend_items.append((label, [l])) + +legend = Legend(items=legend_items, orientation="horizontal") +legend.click_policy = "mute" +p.add_layout(legend, "below") +show(p) +``` From 104856851e54915e4652709a27975609d59262ee Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Tue, 8 Feb 2022 20:45:21 -0800 Subject: [PATCH 10/11] Botched rebase cleanup. --- site/index.md | 92 --------------------------------------------------- 1 file changed, 92 deletions(-) diff --git a/site/index.md b/site/index.md index f03e356..b62e2b7 100644 --- a/site/index.md +++ b/site/index.md @@ -149,98 +149,6 @@ p.add_layout(legend, "below") show(p) ``` -% TODO: Figure out why bokeh won't render when output_notebook is in a hidden cell - -```{code-cell} ipython3 ---- -tags: [] ---- -# For interactive plots -from bokeh.plotting import figure, show, output_notebook -from bokeh.palettes import Category10_10 as palette -output_notebook() -``` - -% TODO: automate project generation based on which data files are in devstats-data - -```{code-cell} ipython3 ---- -tags: [remove-cell] ---- -import json -import datetime -import itertools -from dateutil.parser import isoparse -import numpy as np -import matplotlib.pyplot as plt - -projects = [ - "numpy", "scipy", "matplotlib", "pandas", "scikit-learn", "scikit-image", "networkx" -] - -project_prs = dict() -for proj in projects: - with open(f"../devstats-data/{proj}_prs.json") as fh: - data = [item["node"] for item in json.loads(fh.read())] - - # Only consider prs to the main development branch - default_branches = {"main", "master"} - prs = [pr for pr in data if pr["baseRefName"] in default_branches] - - # Ignore PRs with unknown author - prs = [pr for pr in prs if pr["author"]] # Failed author query results in None - - # Ignore bots - bot_filter = {"dependabot-preview"} - prs = [pr for pr in prs if pr["author"]["login"] not in bot_filter] - - # Split into merged and open - merged_prs = [pr for pr in prs if pr["state"] == "MERGED"] - open_prs = [pr for pr in prs if pr["state"] == "OPEN"] - - # Only look at PRs that have been created or merged in the last year - today = np.datetime64(datetime.datetime.now(), "D") - year = np.timedelta64(365, "D") - merged_prs = [ - pr for pr in merged_prs - if (today - np.datetime64(pr["mergedAt"], "D")) < year - ] - open_prs = [ - pr for pr in open_prs - if (today - np.datetime64(pr["createdAt"], "D")) < year - ] - - project_prs[proj] = { - "open_prs" : open_prs, - "merged_prs" : merged_prs, - } -``` - -```{code-cell} ipython3 ---- -tags: [remove-cell] ---- -# Num merged PRs per month -start_date = today - year -bedges = np.array( - [start_date + i * np.timedelta64(30, "D") for i in range(13)], dtype=np.datetime64 -) -# Proxy date for center of bin -x = bedges[:-1] + np.timedelta64(15, "D") - -# NOTE: np.histogram doesn't work on datetimes -merged_prs_per_month = dict() -for proj, data in project_prs.items(): - merged_prs = data["merged_prs"] - merge_dates = np.array([pr["mergedAt"] for pr in merged_prs], dtype="M8[D]") - num_merged_per_month = [] - for lo, hi in itertools.pairwise(bedges): - num_merged_per_month.append( - sum(1 for md in merge_dates if md > lo and md < hi) - ) - merged_prs_per_month[proj] = num_merged_per_month -``` - ```{code-cell} ipython3 --- tags: [remove-input] From f0ce0fd1f7fd267d0e78a20a09df36e4267f2051 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Wed, 9 Feb 2022 14:18:01 -0800 Subject: [PATCH 11/11] Replace itertools.pairwise. --- site/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/index.md b/site/index.md index b62e2b7..ec055ed 100644 --- a/site/index.md +++ b/site/index.md @@ -113,7 +113,7 @@ for proj, data in project_prs.items(): merge_dates = np.array([pr["mergedAt"] for pr in merged_prs], dtype="M8[D]") num_merged_per_month = [] uniq_mergers = [] - for lo, hi in itertools.pairwise(bedges): + for lo, hi in zip(bedges[:-1], bedges[1:]): month_mask = (merge_dates < hi) & (merge_dates > lo) # Number of PRs merged per month