Code for litmining paper: lineplot and scatterplot for n participants #3

Open · wants to merge 2 commits into main
102 changes: 102 additions & 0 deletions scripts/lineplot_n-over-time_diff-sources.py
@@ -0,0 +1,102 @@
"""Plot median number of participants through time from multiple sources.

The plot shows the median for each year from 3 sources:
- David & al annotations distributed in https://github.com/poldracklab/ScanningTheHorizon
- Annotations of NeuroSynth abstracts distributed in https://github.com/poldracklab/ScanningTheHorizon
- Sample sizes automatically extracted from pubget data.
"""
from pathlib import Path

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd

import utils
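# shared helpers in scripts/utils.py: data loaders, color palette, and figures/output directory paths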

MIN_PAPERS = 20
NS_NAME = "Poldrack & al. [2017]"
DAVID_NAME = "David & al. [2013]"
PUBGET_NAME = "pubextract heuristic [in 2023]"
GPT_NAME = "GPT-3 [in 2023]"

np.random.seed(0)

demographics_file = (
Path(__file__).resolve().parents[1]
/ "data"
/ "outputs"
/ "n_participants_full_dataset.csv"
)
demographics_data = pd.read_csv(demographics_file)
# restrict to single-group studies
demographics_data = demographics_data[demographics_data["n_groups"] == 1]
demographics_data = demographics_data.loc[:, ["publication_year", "count"]]
demographics_data["Data source"] = PUBGET_NAME
demographics_data["publication_year"] = pd.to_datetime(
pd.DataFrame({"year": demographics_data["publication_year"], "month": 1, "day": 1})
)
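# drop years represented by fewer than MIN_PAPERS papers so the yearly median is not driven by a handful of studies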
year_counts = demographics_data["publication_year"].value_counts()
years_with_too_few = year_counts[year_counts < MIN_PAPERS].index.values
demographics_data = demographics_data[~demographics_data["publication_year"].isin(years_with_too_few)]

neurosynth_data = utils.load_neurosynth_sample_sizes().loc[
:, ["publication_year", "count"]
]
neurosynth_data["Data source"] = NS_NAME

david_data = utils.load_david_sample_sizes().loc[
:, ["publication_year", "count"]
]
david_data["Data source"] = DAVID_NAME

gpt_data = utils.load_gpt_sample_sizes().loc[
:, ["publication_year", "count", "pmcid"]
]
# restrict to single-group studies
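# each row of the GPT output describes one participant group, so the number of rows sharing a pmcid gives that paper's number of groups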
for pmcid, group in gpt_data.groupby("pmcid"):
gpt_data.loc[group.index, "n_groups"] = len(group)
gpt_data = gpt_data[gpt_data["n_groups"] == 1]
gpt_data = gpt_data.loc[:, ["publication_year", "count"]]
gpt_data["Data source"] = GPT_NAME
# get rid of years with too few papers
year_counts = gpt_data["publication_year"].value_counts()
years_with_too_few = year_counts[year_counts < MIN_PAPERS].index.values
gpt_data = gpt_data[~gpt_data["publication_year"].isin(years_with_too_few)]

data = pd.concat(
[
demographics_data,
neurosynth_data,
david_data,
gpt_data
], axis=0, ignore_index=True
)


fig, ax = plt.subplots(figsize=(9, 5))
ax.grid(which='major', axis='y', color='gray', alpha=.3)
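# aggregate each year to a percentile of the sample sizes (50 = median) instead of seaborn's default mean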
percentile = 50
sns.lineplot(
data=data,
x="publication_year",
y="count",
hue="Data source",
hue_order=(DAVID_NAME, NS_NAME, PUBGET_NAME, GPT_NAME),
palette=np.asarray(utils.TAB10_COLORS[:4])[[2, 0, 1, 3]],
style="Data source",
estimator=lambda x: np.percentile(x, percentile),
ax=ax,
)
ax.set_xlabel("Publication year")
ax.set_ylabel("Median sample size")
ax.set_ylim(0, 40)
ax.set_xlim(data["publication_year"].min(), data["publication_year"].max())
# ax.legend(loc="upper left", frameon=False)

sns.move_legend(ax, "upper left")

sns.despine()

output_file = utils.get_figures_dir() / "lineplot_n-over-time_diff-sources.pdf"
fig.savefig(output_file, bbox_inches="tight")
139 changes: 139 additions & 0 deletions scripts/scatterplot_truth-vs-gpt-and-heuristic.py
@@ -0,0 +1,139 @@
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd

import utils


np.random.seed(0)

# heuristic data
demographics_file = utils.get_outputs_dir() / "n_participants_full_dataset.csv"
heuristic_data = pd.read_csv(demographics_file).set_index("pmcid")
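# keep the heuristic estimate under its own name so it lines up with the 'gpt' and 'truth' columns after concatenation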
heuristic_data["heuristic"] = heuristic_data["count"]

# gpt data
gpt_file = (
utils.get_outputs_dir()
# / "all_documents_participant_demographics_gpt_tokens-4000_clean.csv"
/ "eval_participant_demographics_gpt_tokens-2000_clean.csv"
)
gpt_data_original = pd.read_csv(gpt_file)
gpt_data = pd.DataFrame(index=gpt_data_original["pmcid"].unique())
# calculate total count for multi-group studies
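# each row of the GPT output is one participant group; summing the per-group counts gives the paper's total sample size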
for pmcid, group in gpt_data_original.groupby("pmcid"):
gpt_data.loc[pmcid, "gpt"] = group["count"].sum()
gpt_data.loc[pmcid, "n_groups"] = len(group)

# ground truth
truth_file = utils.get_outputs_dir() / "evaluation_labels.csv"
truth_data_original = pd.read_csv(truth_file)
truth_data = pd.DataFrame(index=truth_data_original["pmcid"].unique())
for pmcid, group in truth_data_original.groupby("pmcid"):
truth_data.loc[pmcid, "truth"] = group["count"].sum()
truth_data.loc[pmcid, "n_groups"] = len(group)

# concatenate data to compare results
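# outer join on the pmcid index, so papers missing from one source are kept with NaN in that column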
data_full = pd.concat(
[
heuristic_data,
gpt_data,
truth_data,
],
axis=1,
join="outer",
)
# data_full.to_csv('TEMP.csv')

data = data_full.loc[:, ["heuristic", "gpt", "truth"]]
data = data[data.index.isin(truth_data.index)]

# evaluate results
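# a paper with no estimate has NaN in that column, and NaN > -1 is False, so these ratios count how often each method produced any guess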
hr_guessed = len(data[data["heuristic"] > -1])/len(data) * 100
gpt_guessed = len(data[data["gpt"] > -1])/len(data) * 100
print(f'\nheuristic made a guess in {hr_guessed:.1f}% of evaluation papers')
print(f'GPT made a guess in {gpt_guessed:.1f}% of evaluation papers\n')

# summarize errors with medians over papers so a few extreme mismatches do not dominate
hr_mape = np.nanmedian(np.abs(data.heuristic - data.truth) / data.truth) * 100
gpt_mape = np.nanmedian(np.abs(data.gpt - data.truth) / data.truth) * 100
print(f'heuristic median absolute percentage error: {hr_mape:.1f}%')
print(f'GPT median absolute percentage error: {gpt_mape:.1f}%\n')

hr_mae = np.nanmedian(np.abs(data.heuristic - data.truth))
gpt_mae = np.nanmedian(np.abs(data.gpt - data.truth))
print(f'heuristic median absolute error: {hr_mae}')
print(f'GPT median absolute error: {gpt_mae}\n')

hr_correct = len(data[data.heuristic == data.truth]) / len(data) * 100
gpt_correct = len(data[data.gpt == data.truth]) / len(data) * 100
print(f'heuristic was correct in {hr_correct:.1f}% of evaluation papers')
print(f'GPT was correct in {gpt_correct:.1f}% of evaluation papers\n')

median_heuristic = data["heuristic"].median()
median_gpt = data["gpt"].median()
median_truth = data["truth"].median()
print(f"Median of heuristic counts: {median_heuristic}")
print(f"Median of GPT-3 counts: {median_gpt}")
print(f"Median of ground truth counts: {median_truth}")

# create figure
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
data=data,
x="heuristic",
y="truth",
ax=ax,
label="heuristic",
alpha=.7,
)


sns.scatterplot(
data=data,
x="gpt",
y="truth",
ax=ax,
label="GPT-3",
alpha=.7,
)
ax.scatter(
median_heuristic,
median_truth,
color='tab:blue',
marker='*',
s=200,
label="Median of heuristic counts",
alpha=.7,
edgecolors='k'
)
ax.scatter(
median_gpt,
median_truth,
color='tab:orange',
marker='*',
s=200,
label="Median of GPT-3 counts",
alpha=.7,
edgecolors='k'
)

ax.set_xlabel(
"Participant count\n(automatically extracted by heuristic or GPT-3)"
)
ax.set_ylabel("Participant count\n(manually-labelled ground truth)")
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_aspect('equal')
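# log-log axes because sample sizes span several orders of magnitude; the dashed identity line marks perfect agreement with the ground truth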
ax.plot([0.9, 1000], [0.9, 1000], color='k', alpha=.2, linestyle='--')
ax.set_ylim([0.9, 1000])
ax.set_xlim([0.9, 1000])
ax.set_title("All studies")
sns.despine()
plt.legend(loc="lower right")

fig.tight_layout()
fig.savefig(
utils.get_figures_dir()
/ "scatterplot_truth-vs-gpt-and-heuristic.pdf"
)
23 changes: 23 additions & 0 deletions scripts/utils.py
@@ -119,3 +119,26 @@ def load_neurosynth_sample_sizes() -> pd.DataFrame:

def load_david_sample_sizes() -> pd.DataFrame:
return _load_scanning_horizon_sample_sizes("david_sampsizedata.txt")


def load_gpt_sample_sizes() -> pd.DataFrame:
filepath = (
get_outputs_dir()
/ "all_documents_participant_demographics_gpt_tokens-4000_clean.csv"
)
data = pd.read_csv(filepath)
# add the dates
metadata_path = (
pathlib.Path(__file__).resolve().parents[1] / "data" / "metadata.csv"
)
metadata = pd.read_csv(metadata_path)

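# look up each paper's publication year in the metadata table by pmcid (taking the first match)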
for ind, row in data.iterrows():
pmcid = row["pmcid"]
year = metadata[metadata["pmcid"] == pmcid]["publication_year"].values[0]
data.loc[ind, "publication_year"] = int(year)

data["publication_year"] = pd.to_datetime(
pd.DataFrame({"year": data["publication_year"], "month": 1, "day": 1})
)
return data