Code for litmining paper: lineplot and scatterplot for n participants #3

Open · wants to merge 2 commits into main
102 changes: 102 additions & 0 deletions scripts/lineplot_n-over-time_diff-sources.py
@@ -0,0 +1,102 @@
"""Plot median number of participants through time from multiple sources.

The plot shows the median for each year from 3 sources:
- David & al annotations distributed in https://github.com/poldracklab/ScanningTheHorizon
- Annotations of NeuroSynth abstracts distributed in https://github.com/poldracklab/ScanningTheHorizon
- Sample sizes automatically extracted from pubget data.
"""
from pathlib import Path

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd

import utils
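# shared helpers in scripts/utils.py: data loaders, color palette, and figures/output directory paths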

MIN_PAPERS = 20
NS_NAME = "Poldrack & al. [2017]"
DAVID_NAME = "David & al. [2013]"
PUBGET_NAME = "pubextract heuristic [in 2023]"
GPT_NAME = "GPT-3 [in 2023]"

np.random.seed(0)

demographics_file = (
Path(__file__).resolve().parents[1]
/ "data"
/ "outputs"
/ "n_participants_full_dataset.csv"
)
demographics_data = pd.read_csv(demographics_file)
# restrict to single-group studies
demographics_data = demographics_data[demographics_data["n_groups"] == 1]
demographics_data = demographics_data.loc[:, ["publication_year", "count"]]
demographics_data["Data source"] = PUBGET_NAME
demographics_data["publication_year"] = pd.to_datetime(
pd.DataFrame({"year": demographics_data["publication_year"], "month": 1, "day": 1})
)
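# drop years represented by fewer than MIN_PAPERS papers so the yearly median is not driven by a handful of studies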
year_counts = demographics_data["publication_year"].value_counts()
years_with_too_few = year_counts[year_counts < MIN_PAPERS].index.values
demographics_data = demographics_data[~demographics_data["publication_year"].isin(years_with_too_few)]

neurosynth_data = utils.load_neurosynth_sample_sizes().loc[
:, ["publication_year", "count"]
]
neurosynth_data["Data source"] = NS_NAME

david_data = utils.load_david_sample_sizes().loc[
:, ["publication_year", "count"]
]
david_data["Data source"] = DAVID_NAME

gpt_data = utils.load_gpt_sample_sizes().loc[
:, ["publication_year", "count", "pmcid"]
]
# restrict to single-group studies
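# each row of the GPT output describes one participant group, so the number of rows sharing a pmcid gives that paper's number of groups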
for pmcid, group in gpt_data.groupby("pmcid"):
gpt_data.loc[group.index, "n_groups"] = len(group)
gpt_data = gpt_data[gpt_data["n_groups"] == 1]
gpt_data = gpt_data.loc[:, ["publication_year", "count"]]
gpt_data["Data source"] = GPT_NAME
# get rid of years with too few papers
year_counts = gpt_data["publication_year"].value_counts()
years_with_too_few = year_counts[year_counts < MIN_PAPERS].index.values
gpt_data = gpt_data[~gpt_data["publication_year"].isin(years_with_too_few)]

data = pd.concat(
[
demographics_data,
neurosynth_data,
david_data,
gpt_data
], axis=0, ignore_index=True
)


fig, ax = plt.subplots(figsize=(9, 5))
ax.grid(which='major', axis='y', color='gray', alpha=.3)
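# aggregate each year to a percentile of the sample sizes (50 = median) instead of seaborn's default mean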
percentile = 50
sns.lineplot(
data=data,
x="publication_year",
y="count",
hue="Data source",
hue_order=(DAVID_NAME, NS_NAME, PUBGET_NAME, GPT_NAME),
palette=np.asarray(utils.TAB10_COLORS[:4])[[2, 0, 1, 3]],
style="Data source",
estimator=lambda x: np.percentile(x, percentile),
ax=ax,
)
ax.set_xlabel("Publication year")
ax.set_ylabel("Median sample size")
ax.set_ylim(0, 40)
ax.set_xlim(data["publication_year"].min(), data["publication_year"].max())
# ax.legend(loc="upper left", frameon=False)

sns.move_legend(ax, "upper left")

sns.despine()

output_file = utils.get_figures_dir() / "lineplot_n-over-time_diff-sources.pdf"
fig.savefig(output_file, bbox_inches="tight")
139 changes: 139 additions & 0 deletions scripts/scatterplot_truth-vs-gpt-and-heuristic.py
@@ -0,0 +1,139 @@
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd

import utils


np.random.seed(0)

# heuristic data
demographics_file = utils.get_outputs_dir() / "n_participants_full_dataset.csv"
heuristic_data = pd.read_csv(demographics_file).set_index("pmcid")
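# keep the heuristic estimate under its own name so it lines up with the 'gpt' and 'truth' columns after concatenation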
heuristic_data["heuristic"] = heuristic_data["count"]

# gpt data
gpt_file = (
utils.get_outputs_dir()
# / "all_documents_participant_demographics_gpt_tokens-4000_clean.csv"
/ "eval_participant_demographics_gpt_tokens-2000_clean.csv"
)
gpt_data_original = pd.read_csv(gpt_file)
gpt_data = pd.DataFrame(index=gpt_data_original["pmcid"].unique())
# calculate total count for multi-group studies
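# each row of the GPT output is one participant group; summing the per-group counts gives the paper's total sample size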
for pmcid, group in gpt_data_original.groupby("pmcid"):
gpt_data.loc[pmcid, "gpt"] = group["count"].sum()
gpt_data.loc[pmcid, "n_groups"] = len(group)

# ground truth
truth_file = utils.get_outputs_dir() / "evaluation_labels.csv"
truth_data_original = pd.read_csv(truth_file)
truth_data = pd.DataFrame(index=truth_data_original["pmcid"].unique())
for pmcid, group in truth_data_original.groupby("pmcid"):
truth_data.loc[pmcid, "truth"] = group["count"].sum()
truth_data.loc[pmcid, "n_groups"] = len(group)

# concatenate data to compare results
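# outer join on the pmcid index, so papers missing from one source are kept with NaN in that column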
data_full = pd.concat(
[
heuristic_data,
gpt_data,
truth_data,
],
axis=1,
join="outer",
)
# data_full.to_csv('TEMP.csv')

data = data_full.loc[:, ["heuristic", "gpt", "truth"]]
data = data[data.index.isin(truth_data.index)]

# evaluate results
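# a paper with no estimate has NaN in that column, and NaN > -1 is False, so these ratios count how often each method produced any guess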
hr_guessed = len(data[data["heuristic"] > -1])/len(data) * 100
gpt_guessed = len(data[data["gpt"] > -1])/len(data) * 100
print(f'\nheuristic made a guess in {hr_guessed:.1f}% of evaluation papers')
print(f'GPT made a guess in {gpt_guessed:.1f}% of evaluation papers\n')

# summarize errors with medians over papers so a few extreme mismatches do not dominate
hr_mape = np.nanmedian(np.abs(data.heuristic - data.truth) / data.truth) * 100
gpt_mape = np.nanmedian(np.abs(data.gpt - data.truth) / data.truth) * 100
print(f'heuristic median absolute percentage error: {hr_mape:.1f}%')
print(f'GPT median absolute percentage error: {gpt_mape:.1f}%\n')

hr_mae = np.nanmedian(np.abs(data.heuristic - data.truth))
gpt_mae = np.nanmedian(np.abs(data.gpt - data.truth))
print(f'heuristic median absolute error: {hr_mae}')
print(f'GPT median absolute error: {gpt_mae}\n')

hr_correct = len(data[data.heuristic == data.truth]) / len(data) * 100
gpt_correct = len(data[data.gpt == data.truth]) / len(data) * 100
print(f'heuristic was correct in {hr_correct:.1f}% of evaluation papers')
print(f'GPT was correct in {gpt_correct:.1f}% of evaluation papers\n')

median_heuristic = data["heuristic"].median()
median_gpt = data["gpt"].median()
median_truth = data["truth"].median()
print(f"Median of heuristic counts: {median_heuristic}")
print(f"Median of GPT-3 counts: {median_gpt}")
print(f"Median of ground truth counts: {median_truth}")

# create figure
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
data=data,
x="heuristic",
y="truth",
ax=ax,
label="heuristic",
alpha=.7,
)


sns.scatterplot(
data=data,
x="gpt",
y="truth",
ax=ax,
label="GPT-3",
alpha=.7,
)
ax.scatter(
median_heuristic,
median_truth,
color='tab:blue',
marker='*',
s=200,
label="Median of heuristic counts",
alpha=.7,
edgecolors='k'
)
ax.scatter(
median_gpt,
median_truth,
color='tab:orange',
marker='*',
s=200,
label="Median of GPT-3 counts",
alpha=.7,
edgecolors='k'
)

ax.set_xlabel(
"Participant count\n(automatically extracted by heuristic or GPT-3)"
)
ax.set_ylabel("Participant count\n(manually-labelled ground truth)")
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_aspect('equal')
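# log-log axes because sample sizes span several orders of magnitude; the dashed identity line marks perfect agreement with the ground truth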
ax.plot([0.9, 1000], [0.9, 1000], color='k', alpha=.2, linestyle='--')
ax.set_ylim([0.9, 1000])
ax.set_xlim([0.9, 1000])
ax.set_title("All studies")
sns.despine()
plt.legend(loc="lower right")

fig.tight_layout()
fig.savefig(
utils.get_figures_dir()
/ "scatterplot_truth-vs-gpt-and-heuristic.pdf"
)
23 changes: 23 additions & 0 deletions scripts/utils.py
@@ -119,3 +119,26 @@ def load_neurosynth_sample_sizes() -> pd.DataFrame:

def load_david_sample_sizes() -> pd.DataFrame:
return _load_scanning_horizon_sample_sizes("david_sampsizedata.txt")


def load_gpt_sample_sizes() -> pd.DataFrame:
filepath = (
get_outputs_dir()
/ "all_documents_participant_demographics_gpt_tokens-4000_clean.csv"
)
data = pd.read_csv(filepath)
# add the dates
metadata_path = (
pathlib.Path(__file__).resolve().parents[1] / "data" / "metadata.csv"
)
metadata = pd.read_csv(metadata_path)

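# look up each paper's publication year in the metadata table by pmcid (taking the first match)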
for ind, row in data.iterrows():
pmcid = row["pmcid"]
year = metadata[metadata["pmcid"] == pmcid]["publication_year"].values[0]
data.loc[ind, "publication_year"] = int(year)

data["publication_year"] = pd.to_datetime(
pd.DataFrame({"year": data["publication_year"], "month": 1, "day": 1})
)
return data