nestauk · RFOxbury · Nov 7, 2024 · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024
diff --git a/.gitignore b/.gitignore
@@ -165,6 +165,8 @@ cython_debug/
 # input data
 data/*
 !data/*.md
+dsp_interview_transcripts/app/data/*
+!dsp_interview_transcripts/app/data/*.md
 # scrappy outputs from notebooks
 dsp_interview_transcripts/notebooks/outputs/*
 # output data, figures etc

diff --git a/dsp_interview_transcripts/app/app_pages.py b/dsp_interview_transcripts/app/app_pages.py
@@ -0,0 +1,87 @@
+"""Entry point for the multi-page Dash app: basic auth, sidebar navigation and root layout."""
+
+import os
+
+import dash
+import dash_auth
+import dash_bootstrap_components as dbc
+
+from dash import dcc
+from dash import html
+from dash_bootstrap_templates import load_figure_template
+from dotenv import load_dotenv
+
+
+def _required_env(name):
+    """Return the value of environment variable ``name``.
+
+    Raises:
+        RuntimeError: If the variable is unset or empty.
+    """
+    value = os.environ.get(name)
+    if not value:
+        raise RuntimeError(f"Environment variable {name} must be set before starting the app.")
+    return value
+
+
+load_dotenv()
+
+# Initialize Dash app
+app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP], use_pages=True)
+
+# Fail fast on missing credentials instead of handing BasicAuth a {None: None} mapping.
+auth = dash_auth.BasicAuth(app, {_required_env("VALID_USERNAME"): _required_env("VALID_PASSWORD")})
+
+load_figure_template("BOOTSTRAP")
+
+# Importing all the pages - this has to go AFTER app is initialised
+from pages import home
+from pages import overview
+from pages import scatterplot
+
+
+# Sidebar layout
+SIDEBAR_STYLE = {
+    "position": "fixed",
+    "top": 0,
+    "left": 0,
+    "bottom": 0,
+    "width": "16rem",
+    "padding": "2rem 1rem",
+    "background-color": "#f8f9fa",
+}
+
+# Not referenced within this module.
+CONTENT_STYLE = {
+    "margin-left": "18rem",
+    "margin-right": "2rem",
+    "padding": "2rem 1rem",
+}
+
+sidebar = html.Div(
+    [
+        html.H2("QualFML", className="display-5"),
+        html.Hr(),
+        dbc.Nav(
+            [
+                dbc.NavLink("Home", href="/", active="exact"),
+                dbc.NavLink("Topic overview", href="/overview", active="exact"),
+                # dbc.NavLink("Information by topic", href="/topic_info", active="exact"),
+                dbc.NavLink("User response mapping", href="/scatterplot", active="exact"),
+            ],
+            vertical=True,
+            pills=True,
+        ),
+    ],
+    style=SIDEBAR_STYLE,
+)
+
+# Main layout with sidebar and page content
+app.layout = html.Div([dcc.Location(id="url"), sidebar, dash.page_container])
+
+if __name__ == "__main__":
+    app.run_server(
+        # debug=True, # comment out when deploying to production
+        host="0.0.0.0",
+        port=8050,  # comment this part out when testing on your local machine & on public wifi
+    )
diff --git a/dsp_interview_transcripts/app/assets/style.css b/dsp_interview_transcripts/app/assets/style.css
@@ -0,0 +1,28 @@
+@import url("https://fonts.cdnfonts.com/css/century-gothic");
+html,
+body,
+[class*="css"] {
+  /* Fall back to a generic sans-serif face when the web font is unavailable. */
+  font-family: "Century Gothic", sans-serif;
+}
+:root {
+  --md-primary-fg-color: #18a48c;
+  --md-accent-fg-color: #eb003b;
+}
+
+/* Styling for the tab headers */
+.tab--selected {
+  background-color: #0000FF !important;
+  color: white !important;
+}
+
+.tab {
+  background-color: lightgray; /* Tab background when not selected */
+  color: black; /* Tab text color when not selected */
+}
+
+/* Hover effect */
+.tab:hover {
+  background-color: #0000FF;
+  color: white;
+}
diff --git a/dsp_interview_transcripts/app/data.py b/dsp_interview_transcripts/app/data.py
@@ -0,0 +1,50 @@
+"""Load the app's CSV inputs and build the module-level tables derived from them."""
+
+# Getter/S3-based loading, currently disabled in favour of the local CSV reads below.
+# from dsp_interview_transcripts.getters.final import get_summary_table
+# from dsp_interview_transcripts.getters.interim import get_data_w_topics
+# from dsp_interview_transcripts.getters.interim import get_rep_docs
+# from dsp_interview_transcripts.getters.interim import get_topic_names
+# from dsp_interview_transcripts.getters.raw import get_raw_transcripts_cleaned
+# import os
+# from dotenv import load_dotenv
+# load_dotenv()
+# S3_BUCKET = os.environ.get("S3_BUCKET")
+
+import pandas as pd
+
+
+# Name assigned below to responses whose cluster has no generated name.
+NOISE_TOPIC_NAME = "None"
+
+rep_docs = pd.read_csv("data/user_messages_min_len_9_w_sentiment_topics_representative_docs.csv")
+data = pd.read_csv("data/user_messages_min_len_9_w_sentiment_topics.csv")
+data_w_names = pd.read_csv("data/user_messages_min_len_9_w_sentiment_topics_with_names_descriptions.csv")
+
+# Number of responses per cluster. The index and count column are named explicitly
+# because the labels produced by value_counts().reset_index() differ between
+# pandas 1.x and 2.x.
+topic_counts = (
+    data["Cluster"].value_counts().rename_axis("Cluster").reset_index(name="N responses in topic")
+)
+
+data_w_names = data_w_names.rename(columns={"llama3.2_name": "Name", "llama3.2_description": "Description"})
+data_w_names = pd.merge(data_w_names, topic_counts, on="Cluster", how="left")
+
+data_viz = (
+    data.merge(data_w_names[["Cluster", "Name", "Description"]], on="Cluster", how="left")
+    .assign(Name=lambda df: df["Name"].fillna(NOISE_TOPIC_NAME))
+    .assign(Description=lambda df: df["Description"].fillna("None"))
+)
+
+names = data_viz["Name"].unique().tolist()
+
+# 0.2 for the noise cluster, otherwise 0.8.
+# NOTE(review): assumes the noise cluster is the one left without a name above - confirm.
+data_viz["opacity"] = data_viz["Name"].ne(NOISE_TOPIC_NAME).map({True: 0.8, False: 0.2})
+
+transcripts = pd.read_csv("data/qual_af_transcripts_cleaned.csv")
+
+summary_info = pd.read_csv("data/summary_info.csv")[
+    ["Name", "Description", "Top Words", "N responses in topic"]
+].drop_duplicates()
diff --git a/dsp_interview_transcripts/app/data/README.md b/dsp_interview_transcripts/app/data/README.md
@@ -0,0 +1 @@
+Data needs to be downloaded from S3 and stored in this directory; the app reads it from here.
diff --git a/dsp_interview_transcripts/app/pages/__init__.py b/dsp_interview_transcripts/app/pages/__init__.py
diff --git a/dsp_interview_transcripts/app/pages/home.py b/dsp_interview_transcripts/app/pages/home.py
@@ -0,0 +1,110 @@
+"""Home page (path "/"): welcome text, usage guide and methods notes."""
+
+import dash
+
+from dash import dcc
+from dash import html
+
+
+dash.register_page(__name__, path="/")
+
+# Sidebar layout (not referenced within this module)
+SIDEBAR_STYLE = {
+    "position": "fixed",
+    "top": 0,
+    "left": 0,
+    "bottom": 0,
+    "width": "16rem",
+    "padding": "2rem 1rem",
+    "background-color": "#f8f9fa",
+}
+
+# Style applied to this page's outer container.
+CONTENT_STYLE = {
+    "margin-left": "18rem",
+    "margin-right": "2rem",
+    "padding": "2rem 1rem",
+}
+
+# Markdown shown under the "Topic overview" heading.
+_TOPIC_OVERVIEW_MD = """
+                     This tab gives you some high level information about the different topics.
+
+                     The plots at the top tell you:
+                     * the number of different user responses within each topic
+                     * the distribution of sentiments within each topic (most will be neutral). This was calculated by applying a
+                     sentiment analysis model to each response. See below for more information about this.
+                     * The number of different users who appear in each topic. Bear in mind that any one user may have said things that relate to multiple topics,
+                     which is why the greatest possible number of users in a topic is 100 as that is the total number of interviews, but the overall numbers in this plot add up to much more than 100!
+
+                     The table at the bottom lists:
+                     * The name of each topic (generated by llama3.2)
+                     * A description of the topic (also generated by llama3.2)
+                     * Key words in the topic - these occurred frequently in this topic, and less frequently in other topics
+                     * The number of user responses in the topic (the same information that you can see in the first bar chart)
+                     """
+
+# Markdown shown under the "User response mapping" heading.
+_RESPONSE_MAPPING_MD = """
+                     This tab contains an interactive visualisation to help you explore user responses within each topic.
+
+                    The scatterplot shows each user response as a point. **The x and y axes are abstract and do not have any meaning**,
+                    though points that are closer together should be similar semantically and points that are further away are more different
+                    from each other semantically. Each point on the plot is coloured by topic.
+
+                    You can click topics in the legend to remove them from the plot. For example, the topic 'None' contains responses
+                    that could not be assigned to a topic, so you may choose not to display these.
+
+                    You can click a point to find out more information about it. On the left, you will see information about the topic it is in,
+                    the ID of the conversation it occurred in, and the response itself.
+
+                    When you click a point from the plot, the table at the bottom of the tab will display the full conversation between
+                    BOT and USER (*minus the preamble at the beginning, where the user is asked if they understand the instructions). The response
+                    you clicked in the plot will be highlighted in yellow.
+                     """
+
+# Markdown shown under the "Methods" heading.
+_METHODS_MD = """
+                     ### Text preprocessing
+                     * We did some cleaning of the text, for example, grouping together responses that were sent immediately after one another.
+                     * Currently, **user responses shorter than 9 words are excluded from the topic modelling** and consequently are not shown in
+                     the scatterplot on the 'User response mapping' page. However, all user responses are still included in the table at the bottom of
+                     that page.
+
+                     ### Sentiment analysis
+                     We used the model [cardiffnlp/twitter-roberta-base-sentiment-latest](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest)
+                     to predict the sentiment of each response. This model has been trained on lots of data (~124 million tweets) and uses the "knowledge" it
+                     has "learned" about that data to make predictions about new data that we show it. It categorises texts as "Positive", "Negative", or "Neutral".
+
+                     **We have not yet evaluated its performance
+                     on our data**, and so the sentiment predictions should be taken with a pinch of salt.
+
+                     ### Topic modelling
+                     We used the popular library [BERTopic](https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.get_representative_docs) to
+                     extract topics from the data.
+
+                     This method works by:
+                     * Embedding the text data - this means translating the text into a numerical representation (vectors) that a computer can understand.
+                     * Applying a clustering algorithm to the vectors. This works to group together similar vectors (and therefore similar text) into clusters.
+                     * Extracting the key words that distinguish each cluster from the others.
+
+                     We then also used an LLM (llama3.2) to give each cluster a name and description.
+                     """
+
+
+# Page content: headings followed by the Markdown sections defined above.
+# NOTE(review): "display-8" may not be a defined Bootstrap display class - confirm.
+layout = html.Div(
+    children=[
+        html.H2("Welcome!", className="display-6"),
+        html.P("We have created this app to allow you to explore the data from the QualAF interview transcripts."),
+        html.H2("How to use this app", className="display-6"),
+        html.H3("Topic overview", className="display-8"),
+        dcc.Markdown(_TOPIC_OVERVIEW_MD),
+        html.H3("User response mapping", className="display-8"),
+        dcc.Markdown(_RESPONSE_MAPPING_MD),
+        html.H2("Methods", className="display-6"),
+        dcc.Markdown(_METHODS_MD),
+    ],
+    style=CONTENT_STYLE,
+)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Data needs to be downloaded from S3 and stored in this directory; the app reads it from here.