Test submissions processor

s2t2 · Dec 14, 2023 · 1ed5071 · 1ed5071
1 parent 2700287
commit 1ed5071
Show file tree

Hide file tree

Showing 5 changed files with 136 additions and 15 deletions.
diff --git a/app/submissions_processor.py b/app/submissions_processor.py
@@ -14,17 +14,22 @@
 
 class SubmissionsProcessor:
 
-    def __init__(self, dirpath=SUBMISSIONS_DIRPATH, results_dirpath=RESULTS_DIRPATH):
+    def __init__(self, dirpath=SUBMISSIONS_DIRPATH, starter_filename=None, results_dirpath=RESULTS_DIRPATH):
+        """Can use a starter file, or not."""
 
         self.submissions_dirpath = dirpath
-        self.sm = SubmissionsManager(self.submissions_dirpath)
-        print(self.sm.dirpath)
-        print(len(self.sm.filenames))
+        self.starter_filename = starter_filename
 
         self.results_dirpath = results_dirpath or self.submissions_dirpath
         self.notebooks_csv_filepath = os.path.join(self.results_dirpath, "notebooks.csv")
         self.cells_csv_filepath = os.path.join(self.results_dirpath, "cells.csv")
 
+        # get all submission files (consider passing them in for a looser coupling with the manager class):
+        self.sm = SubmissionsManager(self.submissions_dirpath, starter_filename=self.starter_filename)
+        print("SUBMISSIONS DIR:", self.sm.dirpath)
+        print("FILES:", len(self.sm.filenames))
+        self.submission_filepaths = self.sm.filepaths
+
         # available post processing:
         self.starter_dp = None
         self.notebooks_df = None
@@ -36,12 +41,9 @@ def perform(self):
             Produces a CSV file of document statistics, as well as a CSV file of cell contents and metadata.
         """
 
-        starter_filepath = self.sm.find_filepath("STARTER")
-        self.starter_dp = DocumentProcessor(starter_filepath)
-
         all_cells = []
         records = []
-        for filepath in self.sm.filepaths:
+        for filepath in self.submission_filepaths:
             dp = DocumentProcessor(filepath, verbose=False)
             avg_lengths = dp.cells_df.groupby("cell_type")["cell_length"].mean()
             record = {
@@ -77,9 +79,15 @@ def perform(self):
         print("------")
         print("STARTER CELLS:") # (~30% of cells are the same as starter cells)
         #starter_rows = cells_df[ cells_df["filename"].str.contains("STARTER") ]
-        starter_rows = self.cells_df[ self.cells_df["filename"] == self.starter_dp.filename ]
-        self.cells_df = merge(self.cells_df, starter_rows[["cell_id", "page_content"]], how='left', on='page_content', suffixes=('', '_starter'))
-        self.cells_df.rename(columns={"cell_id_starter": "starter_cell_id"}, inplace=True)
+        if self.starter_filename:
+            starter_rows = self.cells_df[ self.cells_df["filename"] == self.starter_filename ]
+            self.cells_df = merge(self.cells_df, starter_rows[["cell_id", "page_content"]], how='left', on='page_content', suffixes=('', '_starter'))
+            self.cells_df.rename(columns={"cell_id_starter": "starter_cell_id"}, inplace=True)
+            #self.cells_df["starter_content"] = self.cells_df['starter_cell_id'].notna()
+            #print(self.cells_df["starter_content"].value_counts())
+        else:
+            self.cells_df["starter_cell_id"] = None
+            #self.cells_df["starter_content"] = False
         self.cells_df["starter_content"] = self.cells_df['starter_cell_id'].notna()
         print(self.cells_df["starter_content"].value_counts())
 
@@ -106,7 +114,7 @@ def plot_documents(self, fig_show=FIG_SHOW):
         #chart_df["filename"] = chart_df.index
         avg_length = chart_df.groupby('filename')['length'].mean().mean()
         title = "Document Lengths (All Content)"
-        title += f"<br><sup>Documents: {len(chart_df:,.0f)} | Avg Length: {avg_length:,.0f} chars</sup>"
+        title += f"<br><sup>Documents: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars</sup>"
         fig = px.violin(chart_df, x="length", box=True, points="all", height=400, title=title,
                 hover_data=["file_id", "filename"] # "file_id",
         )
@@ -127,7 +135,7 @@ def plot_documents(self, fig_show=FIG_SHOW):
         chart_pivot.reset_index(inplace=True) # convert multi-index to columns, https://stackoverflow.com/a/25733562/670433
         avg_length = chart_pivot['length'].mean()
         title = "Document Lengths (Unique Content Only)"
-        title += f"<br><sup>Documents: {len(chart_pivot)} | Avg Length: {avg_length:,.0f} chars</sup>"
+        title += f"<br><sup>Documents: {len(chart_pivot):,.0f} | Avg Length: {avg_length:,.0f} chars</sup>"
         fig = px.violin(chart_pivot, x="length", box=True, points="all", height=400, title=title,
                 hover_data=["file_id", "filename"]
         )
@@ -143,7 +151,7 @@ def plot_cells(self, fig_show=FIG_SHOW):
         chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K
         avg_length = chart_df["cell_length"].mean()
         title = "Cell Lengths (All Content)"
-        title += f"<br><sup>Cells: {len(chart_df:,.0f)} | Avg Length: {avg_length:,.0f} chars</sup>"
+        title += f"<br><sup>Cells: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars</sup>"
         fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, title=title,
                 hover_data=["page_content"], facet_row="cell_type",
                 color="cell_type", color_discrete_map=CELL_COLORS_MAP
@@ -160,7 +168,7 @@ def plot_cells(self, fig_show=FIG_SHOW):
         chart_df = chart_df[chart_df["is_empty"] == False]
         avg_length = chart_df["cell_length"].mean()
         title = "Cell Lengths (Unique Content Only)"
-        title += f"<br><sup>Cells: {len(chart_df:,.0f)} | Avg Length: {avg_length:,.0f} chars</sup>"
+        title += f"<br><sup>Cells: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars</sup>"
         fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, title=title,
                 hover_data=["page_content"], facet_row="cell_type",
                 color="cell_type", color_discrete_map=CELL_COLORS_MAP

diff --git a/conftest.py b/conftest.py
@@ -0,0 +1,7 @@
+
+
+
+import os
+
+TEST_DOCS_DIRPATH = os.path.join(os.path.dirname(__file__), "test", "documents")
+TEST_RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "test", "results")