Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add minimal dash app #12

Draft
wants to merge 15 commits into
base: dev
Choose a base branch
from
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ cython_debug/
# input data
data/*
!data/*.md
dsp_interview_transcripts/app/data/*
!dsp_interview_transcripts/app/data/*.md
# scrappy outputs from notebooks
dsp_interview_transcripts/notebooks/outputs/*
# output data, figures etc
Expand Down
71 changes: 71 additions & 0 deletions dsp_interview_transcripts/app/app_pages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os

import dash
import dash_auth
import dash_bootstrap_components as dbc

from dash import dcc
from dash import html
from dash_bootstrap_templates import load_figure_template
from dotenv import load_dotenv


# Load settings (including the basic-auth credentials read below) from a .env file, if one exists
load_dotenv()

# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP], use_pages=True)

# Protect the app with HTTP basic auth.
# Fail fast when the credentials are not configured: os.environ.get() returns
# None for a missing variable, which would otherwise hand BasicAuth an unusable
# {None: None} credential mapping without any hint about what went wrong.
_username = os.environ.get("VALID_USERNAME")
_password = os.environ.get("VALID_PASSWORD")
if not _username or not _password:
    raise RuntimeError(
        "VALID_USERNAME and VALID_PASSWORD must both be set in the environment (or in a .env file) "
        "before starting the app."
    )

auth = dash_auth.BasicAuth(app, {_username: _password})

# Apply the Bootstrap figure template so plots match the app's stylesheet
load_figure_template("BOOTSTRAP")

# Importing all the pages - this has to go AFTER app is initialised
from pages import home
from pages import overview
from pages import scatterplot


# Sidebar layout
# Inner spacing shared by the sidebar and the page content container.
_BOX_PADDING = "2rem 1rem"

# The sidebar is pinned to the top, left and bottom edges of the viewport.
_PINNED_EDGES = dict.fromkeys(("top", "left", "bottom"), 0)

# Inline CSS for the fixed navigation sidebar.
SIDEBAR_STYLE = {
    "position": "fixed",
    **_PINNED_EDGES,
    "width": "16rem",
    "padding": _BOX_PADDING,
    "background-color": "#f8f9fa",
}

# Inline CSS for page content; the 18rem left margin is wider than the 16rem sidebar.
CONTENT_STYLE = {
    "margin-left": "18rem",
    "margin-right": "2rem",
    "padding": _BOX_PADDING,
}

# (label, path) pairs for the sidebar navigation, in display order.
# The "Information by topic" page ("/topic_info") is currently disabled.
_NAV_ITEMS = (
    ("Home", "/"),
    ("Topic overview", "/overview"),
    ("User response mapping", "/scatterplot"),
)

# One link per page; active="exact" highlights a link only on an exact path match.
_nav_links = [dbc.NavLink(label, href=path, active="exact") for label, path in _NAV_ITEMS]

# Fixed sidebar: app title, a horizontal rule, then a vertical pill-style nav.
sidebar = html.Div(
    children=[
        html.H2("QualFML", className="display-5"),
        html.Hr(),
        dbc.Nav(_nav_links, vertical=True, pills=True),
    ],
    style=SIDEBAR_STYLE,
)

# Main layout with sidebar and page content
# dash.page_container is where Dash Pages renders whichever page matches the URL.
app.layout = html.Div([dcc.Location(id="url"), sidebar, dash.page_container])

if __name__ == "__main__":
    # Use `app.run`: `app.run_server` is deprecated and no longer exists in Dash 3.
    app.run(
        # debug=True,  # comment out when deploying to production
        host="0.0.0.0",  # listen on all network interfaces
        port=8050,  # comment this part out when testing on your local machine & on public wifi
    )
27 changes: 27 additions & 0 deletions dsp_interview_transcripts/app/assets/style.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/* Load the Century Gothic web font from the CDN */
@import url("https://fonts.cdnfonts.com/css/century-gothic");

/* Use Century Gothic everywhere; fall back to a generic sans-serif face
   if the CDN font cannot be loaded, instead of the browser default. */
html,
body,
[class*="css"] {
  font-family: "Century Gothic", sans-serif;
}

/* Brand colours exposed as custom properties */
:root {
  --md-primary-fg-color: #18a48c;
  --md-accent-fg-color: #eb003b;
}

/* Styling for the tab headers */
/* !important so the selected colours win over other tab styling */
.tab--selected {
  background-color: #0000FF !important;
  color: white !important;
}

.tab {
  background-color: lightgray; /* Tab background when not selected */
  color: black; /* Tab text color when not selected */
}

/* Hover effect */
.tab:hover {
  background-color: #0000FF;
  color: white;
}
44 changes: 44 additions & 0 deletions dsp_interview_transcripts/app/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# from dsp_interview_transcripts.getters.final import get_summary_table
# from dsp_interview_transcripts.getters.interim import get_data_w_topics
# from dsp_interview_transcripts.getters.interim import get_rep_docs
# from dsp_interview_transcripts.getters.interim import get_topic_names
# from dsp_interview_transcripts.getters.raw import get_raw_transcripts_cleaned
# import os

from pathlib import Path

import pandas as pd


# from dotenv import load_dotenv


# load_dotenv()

# S3_BUCKET = os.environ.get("S3_BUCKET")

# All input CSVs live in the `data/` folder next to this module. Resolving the
# folder from __file__ (rather than the current working directory) lets the app
# load its data no matter which directory it is started from.
_DATA_DIR = Path(__file__).resolve().parent / "data"

# Label given to responses whose cluster has no name/description.
_NO_TOPIC_NAME = "None"

rep_docs = pd.read_csv(_DATA_DIR / "user_messages_min_len_9_w_sentiment_topics_representative_docs.csv")
data = pd.read_csv(_DATA_DIR / "user_messages_min_len_9_w_sentiment_topics.csv")
data_w_names = pd.read_csv(_DATA_DIR / "user_messages_min_len_9_w_sentiment_topics_with_names_descriptions.csv")

# Number of responses per cluster. The output columns are named explicitly so
# this does not depend on pandas >= 2.0 calling the counts column "count".
topic_counts = data["Cluster"].value_counts().rename_axis("Cluster").reset_index(name="N responses in topic")

data_w_names = data_w_names.rename(columns={"llama3.2_name": "Name", "llama3.2_description": "Description"})
data_w_names = data_w_names.merge(topic_counts, on="Cluster", how="left")

# Attach the topic name/description to every response.
# NOTE(review): exact duplicates are dropped from the lookup first so that, if the
# names table holds several identical rows per cluster, the left merge cannot
# multiply the responses. Confirm the table is meant to be one row per cluster.
_topic_labels = data_w_names[["Cluster", "Name", "Description"]].drop_duplicates()
data_viz = data.merge(_topic_labels, on="Cluster", how="left").fillna(
    {"Name": _NO_TOPIC_NAME, "Description": _NO_TOPIC_NAME}
)

# Distinct topic names, in order of first appearance
names = data_viz["Name"].unique().tolist()

# 0.2 for the noise cluster, otherwise 0.8
# The noise cluster is identified by its label rather than by being the first
# name encountered, so the result no longer depends on the row order of the CSV.
# NOTE(review): assumes unassigned responses are the ones labelled "None" above - confirm.
data_viz["opacity"] = data_viz["Name"].map(lambda name: 0.2 if name == _NO_TOPIC_NAME else 0.8)

transcripts = pd.read_csv(_DATA_DIR / "qual_af_transcripts_cleaned.csv")

# One row per distinct topic summary, restricted to the four columns selected here
summary_info = pd.read_csv(_DATA_DIR / "summary_info.csv")[
    ["Name", "Description", "Top Words", "N responses in topic"]
].drop_duplicates()
# 'conversation', 'uuid', 'context','text_clean']]
1 change: 1 addition & 0 deletions dsp_interview_transcripts/app/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Data needs to be downloaded from S3 and stored in this directory; the app reads its input files from here.
Empty file.
110 changes: 110 additions & 0 deletions dsp_interview_transcripts/app/pages/home.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import dash

from dash import dcc
from dash import html


# Register this module as the Dash page served at the root path ("/")
dash.register_page(__name__, path="/")

# Sidebar layout
# Inner spacing used by both style dictionaries below.
_BOX_PADDING = "2rem 1rem"

# Offsets that pin an element to the top, left and bottom of the viewport.
_PINNED_EDGES = dict.fromkeys(("top", "left", "bottom"), 0)

# Inline CSS for a fixed sidebar (not referenced by the layout in this module).
SIDEBAR_STYLE = {
    "position": "fixed",
    **_PINNED_EDGES,
    "width": "16rem",
    "padding": _BOX_PADDING,
    "background-color": "#f8f9fa",
}

# Inline CSS for this page's content; the 18rem left margin is wider than the 16rem sidebar.
CONTENT_STYLE = {
    "margin-left": "18rem",
    "margin-right": "2rem",
    "padding": _BOX_PADDING,
}


# Markdown shown under the "Topic overview" heading.
_TOPIC_OVERVIEW_HELP = """
This tab gives you some high level information about the different topics.

The plots at the top tell you:
* the number of different user responses within each topic
* the distribution of sentiments within each topic (most will be neutral). This was calculated by applying a
sentiment analysis model to each response. See below for more information about this.
* The number of different users who appear in each topic. Bear in mind that any one user may have said things that relate to multiple topics,
which is why the greatest possible number of users in a topic is 100 as that is the total number of interviews, but the overall numbers in this plot add up to much more than 100!

The table at the bottom lists:
* The name of each topic (generated by llama3.2)
* A description of the topic (also generated by llama3.2)
* Key words in the topic - these occurred frequently in this topic, and less frequently in other topics
* The number of user responses in the topic (the same information that you can see in the first bar chart)
"""

# Markdown shown under the "User response mapping" heading.
_RESPONSE_MAPPING_HELP = """
This tab contains an interactive visualisation to help you explore user responses within each topic.

The scatterplot shows each user response as a point. **The x and y axes are abstract and do not have any meaning**,
though points that are closer together should be similar semantically and points that are further away are more different
from each other semantically. Each point on the plot is coloured by topic.

You can click topics in the legend to remove them from the plot. For example, the topic 'None' contains responses
that could not be assigned to a topic, so you may choose not to display these.

You can click a point to find out more information about it. On the left, you will see information about the topic it is in,
the ID of the conversation it occurred in, and the response itself.

When you click a point from the plot, the table at the bottom of the tab will display the full conversation between
BOT and USER (*minus the preamble at the beginning, where the user is asked if they understand the instructions). The response
you clicked in the plot will be highlighted in yellow.
"""

# Markdown shown under the "Methods" heading.
_METHODS_HELP = """
### Text preprocessing
* We did some cleaning of the text, for example, grouping together responses that were sent immediately after one another.
* Currently, **user responses shorter than 9 words are excluded from the topic modelling** and consequently are not shown in
the scatterplot on the 'User response mapping' page. However, all user responses are still included in the table at the bottom of
that page.

### Sentiment analysis
We used the model [cardiffnlp/twitter-roberta-base-sentiment-latest](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest)
to predict the sentiment of each response. This model has been trained on lots of data (~124 million tweets) and uses the "knowledge" it
has "learned" about that data to make predictions about new data that we show it. It categorises texts as "Positive", "Negative", or "Neutral".

**We have not yet evaluated its performance
on our data**, and so the sentiment predictions should be taken with a pinch of salt.

### Topic modelling
We used the popular library [BERTopic](https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.get_representative_docs) to
extract topics from the data.

This method works by:
* Embedding the text data - this means translating the text into a numerical representation (vectors) that a computer can understand.
* Applying a clustering algorithm to the vectors. This works to group together similar vectors (and therefore similar text) into clusters.
* Extracting the key words that distinguish each cluster from the others.

We then also used an LLM (llama3.2) to give each cluster a name and description.
"""

# Home page: a welcome message, a usage guide for each tab, then the methods notes.
layout = html.Div(
    children=[
        html.H2("Welcome!", className="display-6"),
        html.P("We have created this app to allow you to explore the data from the QualAF interview transcripts."),
        html.H2("How to use this app", className="display-6"),
        html.H3("Topic overview", className="display-8"),
        dcc.Markdown(_TOPIC_OVERVIEW_HELP),
        html.H3("User response mapping", className="display-8"),
        dcc.Markdown(_RESPONSE_MAPPING_HELP),
        html.H2("Methods", className="display-6"),
        dcc.Markdown(_METHODS_HELP),
    ],
    style=CONTENT_STYLE,
)
Loading