diff --git a/app/submissions_processor.py b/app/submissions_processor.py index b6031d3..ab5c9ae 100644 --- a/app/submissions_processor.py +++ b/app/submissions_processor.py @@ -14,17 +14,22 @@ class SubmissionsProcessor: - def __init__(self, dirpath=SUBMISSIONS_DIRPATH, results_dirpath=RESULTS_DIRPATH): + def __init__(self, dirpath=SUBMISSIONS_DIRPATH, starter_filename=None, results_dirpath=RESULTS_DIRPATH): + """Can use a starter file, or not.""" self.submissions_dirpath = dirpath - self.sm = SubmissionsManager(self.submissions_dirpath) - print(self.sm.dirpath) - print(len(self.sm.filenames)) + self.starter_filename = starter_filename self.results_dirpath = results_dirpath or self.submissions_dirpath self.notebooks_csv_filepath = os.path.join(self.results_dirpath, "notebooks.csv") self.cells_csv_filepath = os.path.join(self.results_dirpath, "cells.csv") + # get all submission files (consider passing them in for a looser coupling with the manager class): + self.sm = SubmissionsManager(self.submissions_dirpath, starter_filename=self.starter_filename) + print("SUBMISSIONS DIR:", self.sm.dirpath) + print("FILES:", len(self.sm.filenames)) + self.submission_filepaths = self.sm.filepaths + # available post processing: self.starter_dp = None self.notebooks_df = None @@ -36,12 +41,9 @@ def perform(self): Produces a CSV file of document statistics, as well as a CSV file of cell contents and metadata. 
""" - starter_filepath = self.sm.find_filepath("STARTER") - self.starter_dp = DocumentProcessor(starter_filepath) - all_cells = [] records = [] - for filepath in self.sm.filepaths: + for filepath in self.submission_filepaths: dp = DocumentProcessor(filepath, verbose=False) avg_lengths = dp.cells_df.groupby("cell_type")["cell_length"].mean() record = { @@ -77,9 +79,15 @@ def perform(self): print("------") print("STARTER CELLS:") # (~30% of cells are the same as starter cells) #starter_rows = cells_df[ cells_df["filename"].str.contains("STARTER") ] - starter_rows = self.cells_df[ self.cells_df["filename"] == self.starter_dp.filename ] - self.cells_df = merge(self.cells_df, starter_rows[["cell_id", "page_content"]], how='left', on='page_content', suffixes=('', '_starter')) - self.cells_df.rename(columns={"cell_id_starter": "starter_cell_id"}, inplace=True) + if self.starter_filename: + starter_rows = self.cells_df[ self.cells_df["filename"] == self.starter_filename ] + self.cells_df = merge(self.cells_df, starter_rows[["cell_id", "page_content"]], how='left', on='page_content', suffixes=('', '_starter')) + self.cells_df.rename(columns={"cell_id_starter": "starter_cell_id"}, inplace=True) + #self.cells_df["starter_content"] = self.cells_df['starter_cell_id'].notna() + #print(self.cells_df["starter_content"].value_counts()) + else: + self.cells_df["starter_cell_id"] = None + #self.cells_df["starter_content"] = False self.cells_df["starter_content"] = self.cells_df['starter_cell_id'].notna() print(self.cells_df["starter_content"].value_counts()) @@ -106,7 +114,7 @@ def plot_documents(self, fig_show=FIG_SHOW): #chart_df["filename"] = chart_df.index avg_length = chart_df.groupby('filename')['length'].mean().mean() title = "Document Lengths (All Content)" - title += f"
Documents: {len(chart_df:,.0f)} | Avg Length: {avg_length:,.0f} chars" + title += f"
Documents: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars" fig = px.violin(chart_df, x="length", box=True, points="all", height=400, title=title, hover_data=["file_id", "filename"] # "file_id", ) @@ -127,7 +135,7 @@ def plot_documents(self, fig_show=FIG_SHOW): chart_pivot.reset_index(inplace=True) # convert multi-index to columns, https://stackoverflow.com/a/25733562/670433 avg_length = chart_pivot['length'].mean() title = "Document Lengths (Unique Content Only)" - title += f"
Documents: {len(chart_pivot)} | Avg Length: {avg_length:,.0f} chars" + title += f"
Documents: {len(chart_pivot):,.0f} | Avg Length: {avg_length:,.0f} chars" fig = px.violin(chart_pivot, x="length", box=True, points="all", height=400, title=title, hover_data=["file_id", "filename"] ) @@ -143,7 +151,7 @@ def plot_cells(self, fig_show=FIG_SHOW): chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K avg_length = chart_df["cell_length"].mean() title = "Cell Lengths (All Content)" - title += f"
Cells: {len(chart_df:,.0f)} | Avg Length: {avg_length:,.0f} chars" + title += f"
Cells: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars" fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, title=title, hover_data=["page_content"], facet_row="cell_type", color="cell_type", color_discrete_map=CELL_COLORS_MAP @@ -160,7 +168,7 @@ def plot_cells(self, fig_show=FIG_SHOW): chart_df = chart_df[chart_df["is_empty"] == False] avg_length = chart_df["cell_length"].mean() title = "Cell Lengths (Unique Content Only)" - title += f"
Cells: {len(chart_df:,.0f)} | Avg Length: {avg_length:,.0f} chars" + title += f"
Cells: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars" fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, title=title, hover_data=["page_content"], facet_row="cell_type", color="cell_type", color_discrete_map=CELL_COLORS_MAP diff --git a/conftest.py b/conftest.py index e69de29..2614388 100644 --- a/conftest.py +++ b/conftest.py @@ -0,0 +1,7 @@ + + + +import os + +TEST_DOCS_DIRPATH = os.path.join(os.path.dirname(__file__), "test", "documents") +TEST_RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "test", "results") diff --git a/test/results/cells.csv b/test/results/cells.csv new file mode 100644 index 0000000..ad542a4 --- /dev/null +++ b/test/results/cells.csv @@ -0,0 +1,34 @@ +file_id,filename,cell_id,cell_length,cell_type,is_empty,page_content,dup_content,starter_cell_id,starter_content +the,Making_the_Most_of_your_Colab_Subscription.ipynb,1,71,TEXT,False,"'markdown' cell: '['# Making the Most of your Colab Subscription', '']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,2,798,TEXT,False,"'markdown' cell: '['## Faster GPUs', '', ""Users who have purchased one of Colab's paid plans have access to premium GPUs. You can upgrade your notebook's GPU settings in `Runtime > Change runtime type` in the menu to enable Premium accelerator. Subject to availability, selecting a premium GPU may grant you access to a V100 or A100 Nvidia GPU."", '', ""The free of charge version of Colab grants access to Nvidia's T4 GPUs subject to quota restrictions and availability."", '', 'You can see what GPU you\'ve been assigned at any time by executing the following cell. 
If the execution result of running the code cell below is ""Not connected to a GPU"", you can change the runtime by going to `Runtime > Change runtime type` in the menu to enable a GPU accelerator, and then re-execute the code cell.']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,3,250,CODE,False,"'code' cell: '['gpu_info = !nvidia-smi', ""gpu_info = '\\n'.join(gpu_info)"", ""if gpu_info.find('failed') >= 0:"", "" print('Not connected to a GPU')"", 'else:', ' print(gpu_info)']' with output: '['/bin/bash: line 1: nvidia-smi: command not found\n']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,4,168,TEXT,False,"'markdown' cell: '['In order to use a GPU with your notebook, select the `Runtime > Change runtime type` menu, and then set the hardware accelerator dropdown to GPU.']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,5,524,TEXT,False,"'markdown' cell: '['## More memory', '', ""Users who have purchased one of Colab's paid plans have access to high-memory VMs when they are available."", '', '', '', 'You can see how much memory you have available at any time by running the following code cell. If the execution result of running the code cell below is ""Not using a high-RAM runtime"", then you can enable a high-RAM runtime via `Runtime > Change runtime type` in the menu. Then select High-RAM in the Runtime shape dropdown. 
After, re-execute the code cell.']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,6,408,CODE,False,"'code' cell: '['from psutil import virtual_memory', 'ram_gb = virtual_memory().total / 1e9', ""print('Your runtime has {:.1f} gigabytes of available RAM\\n'.format(ram_gb))"", '', 'if ram_gb < 20:', "" print('Not using a high-RAM runtime')"", 'else:', "" print('You are using a high-RAM runtime!')""]' with output: '['Your runtime has 13.6 gigabytes of available RAM\n', '\n', 'Not using a high-RAM runtime\n']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,7,516,TEXT,False,"'markdown' cell: '['## Longer runtimes', '', ""All Colab runtimes are reset after some period of time (which is faster if the runtime isn't executing code). Colab Pro and Pro+ users have access to longer runtimes than those who use Colab free of charge."", '', '## Background execution', '', ""Colab Pro+ users have access to background execution, where notebooks will continue executing even after you've closed a browser tab. This is always enabled in Pro+ runtimes as long as you have compute units available."", '']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,8,521,TEXT,False,"'markdown' cell: '['## Relaxing resource limits in Colab Pro', '', ""Your resources are not unlimited in Colab. To make the most of Colab, avoid using resources when you don't need them. For example, only use a GPU when required and close Colab tabs when finished."", '', '', '', 'If you encounter limitations, you can relax those limitations by purchasing more compute units via Pay As You Go. Anyone can purchase compute units via [Pay As You Go](https://colab.research.google.com/signup); no subscription is required.']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,9,438,TEXT,False,"'markdown' cell: '['## Send us feedback!', '', ""If you have any feedback for us, please let us know. 
The best way to send feedback is by using the Help > 'Send feedback...' menu. If you encounter usage limits in Colab Pro consider subscribing to Pro+."", '', 'If you encounter errors or other issues with billing (payments) for Colab Pro, Pro+, or Pay As You Go, please email [colab-billing@google.com](mailto:colab-billing@google.com).']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,10,1600,TEXT,False,"'markdown' cell: '['## More Resources', '', '### Working with Notebooks in Colab', '- [Overview of Colaboratory](/notebooks/basic_features_overview.ipynb)', '- [Guide to Markdown](/notebooks/markdown_guide.ipynb)', '- [Importing libraries and installing dependencies](/notebooks/snippets/importing_libraries.ipynb)', '- [Saving and loading notebooks in GitHub](https://colab.research.google.com/github/googlecolab/colabtools/blob/main/notebooks/colab-github-demo.ipynb)', '- [Interactive forms](/notebooks/forms.ipynb)', '- [Interactive widgets](/notebooks/widgets.ipynb)', '', '', '### Working with Data', '- [Loading data: Drive, Sheets, and Google Cloud Storage](/notebooks/io.ipynb)', '- [Charts: visualizing data](/notebooks/charts.ipynb)', '- [Getting started with BigQuery](/notebooks/bigquery.ipynb)', '', '### Machine Learning Crash Course', ""These are a few of the notebooks from Google's online Machine Learning course. 
See the [full course website](https://developers.google.com/machine-learning/crash-course/) for more."", '- [Intro to Pandas DataFrame](https://colab.research.google.com/github/google/eng-edu/blob/main/ml/cc/exercises/pandas_dataframe_ultraquick_tutorial.ipynb)', '- [Linear regression with tf.keras using synthetic data](https://colab.research.google.com/github/google/eng-edu/blob/main/ml/cc/exercises/linear_regression_with_synthetic_data.ipynb)', '', '', '', '### Using Accelerated Hardware', '- [TensorFlow with GPUs](/notebooks/gpu.ipynb)', '- [TensorFlow with TPUs](/notebooks/tpu.ipynb)']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,11,1171,TEXT,False,"'markdown' cell: '['', '', '## Machine Learning Examples', '', 'To see end-to-end examples of the interactive machine learning analyses that Colaboratory makes possible, check out these tutorials using models from [TensorFlow Hub](https://tfhub.dev).', '', 'A few featured examples:', '', '- [Retraining an Image Classifier](https://tensorflow.org/hub/tutorials/tf2_image_retraining): Build a Keras model on top of a pre-trained image classifier to distinguish flowers.', '- [Text Classification](https://tensorflow.org/hub/tutorials/tf2_text_classification): Classify IMDB movie reviews as either *positive* or *negative*.', '- [Style Transfer](https://tensorflow.org/hub/tutorials/tf2_arbitrary_image_stylization): Use deep learning to transfer style between images.', '- [Multilingual Universal Sentence Encoder Q&A](https://tensorflow.org/hub/tutorials/retrieval_with_tf_hub_universal_encoder_qa): Use a machine learning model to answer questions from the SQuAD dataset.', '- [Video Interpolation](https://tensorflow.org/hub/tutorials/tweening_conv3d): Predict what happened in a video between the first and the last frame.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,1,164,TEXT,False,"'markdown' cell: '['# Cells', 'A notebook is a list of cells. 
Cells contain either explanatory text or executable code and its output. Click a cell to select it.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,2,595,TEXT,False,"'markdown' cell: '['## Code cells', 'Below is a **code cell**. Once the toolbar button indicates CONNECTED, click in the cell to select it and execute the contents in the following ways:', '', '* Click the **Play icon** in the left gutter of the cell;', '* Type **Cmd/Ctrl+Enter** to run the cell in place;', '* Type **Shift+Enter** to run the cell and move focus to the next cell (adding one if none exists); or', '* Type **Alt+Enter** to run the cell and insert a new code cell immediately below it.', '', 'There are additional options for running some or all cells in the **Runtime** menu.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,3,497,TEXT,False,"'markdown' cell: '['## Text cells', 'This is a **text cell**. You can **double-click** to edit this cell. Text cells', 'use markdown syntax. To learn more, see our [markdown', 'guide](/notebooks/markdown_guide.ipynb).', '', 'You can also add math to text cells using [LaTeX](http://www.latex-project.org/)', 'to be rendered by [MathJax](https://www.mathjax.org). Just place the statement', 'within a pair of **\\$** signs. For example `$\\sqrt{3x-1}+(1+x)^2$` becomes', '$\\sqrt{3x-1}+(1+x)^2.$']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,4,706,TEXT,False,"'markdown' cell: '['## Adding and moving cells', 'You can add new cells by using the **+ CODE** and **+ TEXT** buttons that show when you hover between cells. These buttons are also in the toolbar above the notebook where they can be used to add a cell below the currently selected cell.', '', 'You can move a cell by selecting it and clicking **Cell Up** or **Cell Down** in the top toolbar.', '', 'Consecutive cells can be selected by ""lasso selection"" by dragging from outside one cell and through the group. 
Non-adjacent cells can be selected concurrently by clicking one and then holding down Ctrl while clicking another. Similarly, using Shift instead of Ctrl will select all intermediate cells.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,5,180,TEXT,False,"'markdown' cell: '['# Working with python', 'Colaboratory is built on top of [Jupyter Notebook](https://jupyter.org/). Below are some examples of convenience functions provided.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,6,185,TEXT,False,'markdown' cell: '['Long running python processes can be interrupted. Run the following cell and select **Runtime -> Interrupt execution** (*hotkey: Cmd/Ctrl-M I*) to stop execution.']',False,,False +of,Overview_of_Colaboratory_Features.ipynb,7,161,CODE,False,"'code' cell: '['import time', 'print(""Sleeping"")', 'time.sleep(30) # sleep for a while; interrupt me!', 'print(""Done Sleeping"")']' with output: '['Sleeping\n']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,8,109,TEXT,False,"'markdown' cell: '['## System aliases', '', 'Jupyter includes shortcuts for common operations, such as ls:']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,9,369,CODE,False,"'code' cell: '['!ls /bin']' with output: '[""'['\t\t\t\t mknod\n"", ' 7z\t\t\t\t mktemp\n', ' 7za\t\t\t\t mm2gv\n', ' 7zr\t\t\t\t more\n', ' aclocal\t\t\t mount\n', ' aclocal-1.16\t\t\t mountpoint\n', ' acyclic\t\t\t mpexpand\n', ' add-apt-repository\t\t mpic++\n', ' addpart\t\t\t mpicc\n', ' addr2line\t\t\t mpiCC\n']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,10,459,TEXT,False,"'markdown' cell: '['That `!ls` probably generated a large output. You can select the cell and clear the output by either:', '', '1. Clicking on the clear output button (x) in the toolbar above the cell; or', '2. 
Right clicking the left gutter of the output area and selecting ""Clear output"" from the context menu.', '', 'Execute any other process using `!` with string interpolation from python variables, and note the result can be assigned to a variable:']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,11,309,TEXT,False,"'markdown' cell: '['## Magics', ""Colaboratory shares the notion of magics from Jupyter. There are shorthand annotations that change how a cell's text is executed. To learn more, see [Jupyter's magics page](http://nbviewer.jupyter.org/github/ipython/ipython/blob/1.x/examples/notebooks/Cell%20Magics.ipynb).""]'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,12,300,TEXT,False,"'markdown' cell: '['## Automatic completions and exploring code', '', 'Colab provides automatic completions to explore attributes of Python objects, as well as to quickly view documentation strings. As an example, first run the following cell to import the [`numpy`](http://www.numpy.org) module.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,13,37,CODE,False,'code' cell: '['import numpy as np']',False,,False +of,Overview_of_Colaboratory_Features.ipynb,14,218,TEXT,False,"'markdown' cell: '['If you now insert your cursor after `np` and press **Period**(`.`), you will see the list of available completions within the `np` module. 
Completions can be opened again by using **Ctrl+Space**.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,15,148,TEXT,False,"'markdown' cell: '['If you type an open parenthesis after any function or class in the module, you will see a pop-up of its documentation string:']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,16,359,TEXT,False,"'markdown' cell: '['The documentation can be opened again using **Ctrl+Shift+Space** or you can view the documentation for method by mouse hovering over the method name.', '', 'When hovering over the method name the `Open in tab` link will open the documentation in a persistent pane. The `View source` link will navigate to the source code for the method.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,17,46,TEXT,False,'markdown' cell: '['## Exception Formatting']',False,,False +of,Overview_of_Colaboratory_Features.ipynb,18,72,TEXT,False,'markdown' cell: '['Exceptions are formatted nicely in Colab outputs:']',False,,False +of,Overview_of_Colaboratory_Features.ipynb,19,109,CODE,False,"'code' cell: '['x = 1', 'y = 4', 'z = y/(1-x)']' , gives error 'ZeroDivisionError',with description 'ignored'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,20,165,TEXT,False,"'markdown' cell: '['## Rich, interactive outputs', 'Until now all of the generated outputs have been text, but they can be more interesting, like the chart below.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,21,613,TEXT,False,"'markdown' cell: '['# Integration with Drive', '', 'Colaboratory is integrated with Google Drive. It allows you to share, comment, and collaborate on the same document with multiple people:', '', '* The **SHARE** button (top-right of the toolbar) allows you to share the notebook and control permissions set on it.', '', '* **File->Make a Copy** creates a copy of the notebook in Drive.', '', ""* **File->Save** saves the File to Drive. 
**File->Save and checkpoint** pins the version so it doesn't get deleted from the revision history."", '', ""* **File->Revision history** shows the notebook's revision history.""]'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,22,1015,TEXT,False,"'markdown' cell: '['## Commenting on a cell', 'You can comment on a Colaboratory notebook like you would on a Google Document. Comments are attached to cells, and are displayed next to the cell they refer to. If you have **comment-only** permissions, you will see a comment button on the top right of the cell when you hover over it.', '', 'If you have edit or comment permissions you can comment on a cell in one of three ways:', '', '1. Select a cell and click the comment button in the toolbar above the top-right corner of the cell.', '1. Right click a text cell and select **Add a comment** from the context menu.', '3. Use the shortcut **Ctrl+Shift+M** to add a comment to the currently selected cell.', '', 'You can resolve and reply to comments, and you can target comments to specific collaborators by typing *+[email address]* (e.g., `+user@domain.com`). 
Addressed collaborators will be emailed.', '', 'The Comment button in the top-right corner of the page shows all comments attached to the notebook.']'",False,,False diff --git a/test/results/notebooks.csv b/test/results/notebooks.csv new file mode 100644 index 0000000..da77fd6 --- /dev/null +++ b/test/results/notebooks.csv @@ -0,0 +1,3 @@ +filename,file_id,length,cells,code_cells,text_cells,avg_code_cell_length,avg_text_cell_length +Making_the_Most_of_your_Colab_Subscription.ipynb,the,6497,11,2,9,329.0,645.2 +Overview_of_Colaboratory_Features.ipynb,of,6888,22,4,18,169.0,341.1 diff --git a/test/submissions_processor_test.py b/test/submissions_processor_test.py new file mode 100644 index 0000000..f8cc7ed --- /dev/null +++ b/test/submissions_processor_test.py @@ -0,0 +1,69 @@ + +from pandas import DataFrame + + +from app.submissions_processor import SubmissionsProcessor + +from conftest import TEST_DOCS_DIRPATH, TEST_RESULTS_DIRPATH + + +EXPECTED_NOTEBOOK_RECORDS = [ + { + 'filename': 'Making_the_Most_of_your_Colab_Subscription.ipynb', + 'file_id': 'the', # default id not the best for the test files. it's ok. todo: revisit + 'length': 6497, + 'cells': 11, + 'code_cells': 2, + 'text_cells': 9, + 'avg_code_cell_length': 329.0, + 'avg_text_cell_length': 645.2, + }, + { + 'filename': 'Overview_of_Colaboratory_Features.ipynb', + 'file_id': 'of', # default id not the best for the test files. it's ok. 
todo: revisit + 'length': 6888, + 'cells': 22, + 'code_cells': 4, + 'text_cells': 18, + 'avg_code_cell_length': 169.0, + 'avg_text_cell_length': 341.1, + + } +] + +def test_submissions_processor(): + + sp = SubmissionsProcessor(dirpath=TEST_DOCS_DIRPATH, results_dirpath=TEST_RESULTS_DIRPATH, starter_filename="Overview_of_Colaboratory_Features.ipynb") + sp.perform() + + assert isinstance(sp.notebooks_df, DataFrame) + assert sp.notebooks_df.to_dict("records") == EXPECTED_NOTEBOOK_RECORDS + + assert isinstance(sp.cells_df, DataFrame) + assert len(sp.cells_df) == 33 + assert sp.cells_df.columns.tolist() == ['file_id', 'filename', 'cell_id', 'cell_length', 'cell_type', 'is_empty', 'page_content', 'dup_content', 'starter_cell_id', 'starter_content'] + assert sp.cells_df["is_empty"].sum() == 0 # there are no blank cells in the test notebooks + assert sp.cells_df["dup_content"].sum() == 0 # there are no overlapping cells in the test notebooks + assert sp.cells_df["starter_content"].sum() == 22 + + starter_cells = sp.cells_df[sp.cells_df["filename"] == sp.starter_filename] + other_cells = sp.cells_df[sp.cells_df["filename"] != sp.starter_filename] + assert len(starter_cells) == 22 + assert len(other_cells) == 11 + + + +def test_submissions_processor_without_starter(): + + sp = SubmissionsProcessor(dirpath=TEST_DOCS_DIRPATH, results_dirpath=TEST_RESULTS_DIRPATH, starter_filename=None) + sp.perform() + + assert isinstance(sp.notebooks_df, DataFrame) + assert sp.notebooks_df.to_dict("records") == EXPECTED_NOTEBOOK_RECORDS + + assert isinstance(sp.cells_df, DataFrame) + assert len(sp.cells_df) == 33 + assert sp.cells_df.columns.tolist() == ['file_id', 'filename', 'cell_id', 'cell_length', 'cell_type', 'is_empty', 'page_content', 'dup_content', 'starter_cell_id', 'starter_content'] + assert sp.cells_df["is_empty"].sum() == 0 # there are no blank cells in the test notebooks + assert sp.cells_df["dup_content"].sum() == 0 # there are no overlapping cells in the test notebooks + 
assert sp.cells_df["starter_content"].sum() == 0 # no starter if we don't want it