Skip to content

Commit

Permalink
Test submissions processor
Browse files Browse the repository at this point in the history
  • Loading branch information
s2t2 committed Dec 14, 2023
1 parent 2700287 commit 1ed5071
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 15 deletions.
38 changes: 23 additions & 15 deletions app/submissions_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,22 @@

class SubmissionsProcessor:

def __init__(self, dirpath=SUBMISSIONS_DIRPATH, results_dirpath=RESULTS_DIRPATH):
def __init__(self, dirpath=SUBMISSIONS_DIRPATH, starter_filename=None, results_dirpath=RESULTS_DIRPATH):
"""Can use a starter file, or not."""

self.submissions_dirpath = dirpath
self.sm = SubmissionsManager(self.submissions_dirpath)
print(self.sm.dirpath)
print(len(self.sm.filenames))
self.starter_filename = starter_filename

self.results_dirpath = results_dirpath or self.submissions_dirpath
self.notebooks_csv_filepath = os.path.join(self.results_dirpath, "notebooks.csv")
self.cells_csv_filepath = os.path.join(self.results_dirpath, "cells.csv")

# get all submission files (consider passing them in for a looser coupling with the manager class):
self.sm = SubmissionsManager(self.submissions_dirpath, starter_filename=self.starter_filename)
print("SUBMISSIONS DIR:", self.sm.dirpath)
print("FILES:", len(self.sm.filenames))
self.submission_filepaths = self.sm.filepaths

# available post processing:
self.starter_dp = None
self.notebooks_df = None
Expand All @@ -36,12 +41,9 @@ def perform(self):
Produces a CSV file of document statistics, as well as a CSV file of cell contents and metadata.
"""

starter_filepath = self.sm.find_filepath("STARTER")
self.starter_dp = DocumentProcessor(starter_filepath)

all_cells = []
records = []
for filepath in self.sm.filepaths:
for filepath in self.submission_filepaths:
dp = DocumentProcessor(filepath, verbose=False)
avg_lengths = dp.cells_df.groupby("cell_type")["cell_length"].mean()
record = {
Expand Down Expand Up @@ -77,9 +79,15 @@ def perform(self):
print("------")
print("STARTER CELLS:") # (~30% of cells are the same as starter cells)
#starter_rows = cells_df[ cells_df["filename"].str.contains("STARTER") ]
starter_rows = self.cells_df[ self.cells_df["filename"] == self.starter_dp.filename ]
self.cells_df = merge(self.cells_df, starter_rows[["cell_id", "page_content"]], how='left', on='page_content', suffixes=('', '_starter'))
self.cells_df.rename(columns={"cell_id_starter": "starter_cell_id"}, inplace=True)
if self.starter_filename:
starter_rows = self.cells_df[ self.cells_df["filename"] == self.starter_filename ]
self.cells_df = merge(self.cells_df, starter_rows[["cell_id", "page_content"]], how='left', on='page_content', suffixes=('', '_starter'))
self.cells_df.rename(columns={"cell_id_starter": "starter_cell_id"}, inplace=True)
#self.cells_df["starter_content"] = self.cells_df['starter_cell_id'].notna()
#print(self.cells_df["starter_content"].value_counts())
else:
self.cells_df["starter_cell_id"] = None
#self.cells_df["starter_content"] = False
self.cells_df["starter_content"] = self.cells_df['starter_cell_id'].notna()
print(self.cells_df["starter_content"].value_counts())

Expand All @@ -106,7 +114,7 @@ def plot_documents(self, fig_show=FIG_SHOW):
#chart_df["filename"] = chart_df.index
avg_length = chart_df.groupby('filename')['length'].mean().mean()
title = "Document Lengths (All Content)"
title += f"<br><sup>Documents: {len(chart_df:,.0f)} | Avg Length: {avg_length:,.0f} chars</sup>"
title += f"<br><sup>Documents: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars</sup>"
fig = px.violin(chart_df, x="length", box=True, points="all", height=400, title=title,
hover_data=["file_id", "filename"] # "file_id",
)
Expand All @@ -127,7 +135,7 @@ def plot_documents(self, fig_show=FIG_SHOW):
chart_pivot.reset_index(inplace=True) # convert multi-index to columns, https://stackoverflow.com/a/25733562/670433
avg_length = chart_pivot['length'].mean()
title = "Document Lengths (Unique Content Only)"
title += f"<br><sup>Documents: {len(chart_pivot)} | Avg Length: {avg_length:,.0f} chars</sup>"
title += f"<br><sup>Documents: {len(chart_pivot):,.0f} | Avg Length: {avg_length:,.0f} chars</sup>"
fig = px.violin(chart_pivot, x="length", box=True, points="all", height=400, title=title,
hover_data=["file_id", "filename"]
)
Expand All @@ -143,7 +151,7 @@ def plot_cells(self, fig_show=FIG_SHOW):
chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K
avg_length = chart_df["cell_length"].mean()
title = "Cell Lengths (All Content)"
title += f"<br><sup>Cells: {len(chart_df:,.0f)} | Avg Length: {avg_length:,.0f} chars</sup>"
title += f"<br><sup>Cells: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars</sup>"
fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, title=title,
hover_data=["page_content"], facet_row="cell_type",
color="cell_type", color_discrete_map=CELL_COLORS_MAP
Expand All @@ -160,7 +168,7 @@ def plot_cells(self, fig_show=FIG_SHOW):
chart_df = chart_df[chart_df["is_empty"] == False]
avg_length = chart_df["cell_length"].mean()
title = "Cell Lengths (Unique Content Only)"
title += f"<br><sup>Cells: {len(chart_df:,.0f)} | Avg Length: {avg_length:,.0f} chars</sup>"
title += f"<br><sup>Cells: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars</sup>"
fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, title=title,
hover_data=["page_content"], facet_row="cell_type",
color="cell_type", color_discrete_map=CELL_COLORS_MAP
Expand Down
7 changes: 7 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@



import os

# Test fixture locations, resolved relative to this file's directory:
# "test/documents" is the docs directory and "test/results" the results directory.
_TEST_DIRPATH = os.path.join(os.path.dirname(__file__), "test")
TEST_DOCS_DIRPATH = os.path.join(_TEST_DIRPATH, "documents")
TEST_RESULTS_DIRPATH = os.path.join(_TEST_DIRPATH, "results")
Loading

0 comments on commit 1ed5071

Please sign in to comment.