From 75e530254c2e4f4c9b31a3b6c903bcd9f8c8256d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 4 Nov 2024 15:50:00 +0100 Subject: [PATCH] Refactored naming code and switched to SmolLM-1.7b --- turftopic/namers/base.py | 51 +++++------------- turftopic/namers/hf_transformers.py | 84 ++++------------------------- 2 files changed, 24 insertions(+), 111 deletions(-) diff --git a/turftopic/namers/base.py b/turftopic/namers/base.py index 62249ee..97d2ee2 100644 --- a/turftopic/namers/base.py +++ b/turftopic/namers/base.py @@ -1,29 +1,16 @@ from abc import ABC, abstractmethod -from typing import Optional -DEFAULT_POSITIVE_PROMPT = """ -You will be tasked with naming a topic. -The topic is described by the following set of keywords: {positive}. - -Based on the keywords, create a short label that best summarizes the topics. -The topic name should be at maximum three words long. -Only respond with a short topic name and nothing else. -""" +from rich.progress import track -DEFAULT_NEGATIVE_PROMPT = """ +DEFAULT_PROMPT = """ You will be tasked with naming a topic. -The topic is described with most relevant positive and negative terms. -Make sure to consider the negative terms as well when naming the topic. -An example of a topic name like this would be: Oriental vs. European Cuisine - -Positive terms: {positive} - -Negative terms: {negative} +Based on the keywords, create a short label that best summarizes the topics. +Only respond with a short, human readable topic name and nothing else. -Based on the keywords, create a short label (5 words maximum) that best summarizes the topics. -Only respond with the topic name and nothing else. +The topic is described by the following set of keywords: {keywords}. """ + DEFAULT_SYSTEM_PROMPT = """ You are a topic namer. When the user gives you a set of keywords, you respond with a name for the topic they describe. You only repond briefly with the name of the topic, and nothing else. @@ -34,18 +21,14 @@ class TopicNamer(ABC): @abstractmethod def name_topic( self, - positive: list[str], - negative: Optional[list[str]] = None, + keywords: list[str], ) -> str: """Names one topics based on top descriptive terms. Parameters ---------- - positive: list[str] + keywords: list[str] Top K highest ranking terms on the topic. - negative: list[str], default None - Top K lowest ranking terms on the topic. - (this is only relevant in the context of $S^3$) Returns ------- @@ -56,27 +39,21 @@ def name_topic( def name_topics( self, - positive: list[list[str]], - negative: Optional[list[list[str]]] = None, + keywords: list[list[str]], ) -> list[str]: """Names all topics based on top descriptive terms. Parameters ---------- - positive: list[list[str]] + keywords: list[list[str]] Top K highest ranking terms on the topics. - negative: list[list[str]], default None - Top K lowest ranking terms on the topics - (this is only relevant in the context of $S^3$) Returns ------- list[str] Topic names returned by the namer. """ - if negative is not None: - return [ - self.name_topic(pos, neg) - for pos, neg in zip(positive, negative) - ] - return [self.name_topic(pos) for pos in positive] + names = [] + for keys in track(keywords, description="Naming topics..."): + names.append(self.name_topic(keys)) + return names diff --git a/turftopic/namers/hf_transformers.py b/turftopic/namers/hf_transformers.py index dd4d332..517cc26 100644 --- a/turftopic/namers/hf_transformers.py +++ b/turftopic/namers/hf_transformers.py @@ -1,74 +1,18 @@ -from typing import Optional +from transformers import pipeline -from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline +from turftopic.namers.base import (DEFAULT_PROMPT, DEFAULT_SYSTEM_PROMPT, + TopicNamer) -from turftopic.namers.base import ( - DEFAULT_NEGATIVE_PROMPT, - DEFAULT_POSITIVE_PROMPT, - DEFAULT_SYSTEM_PROMPT, - TopicNamer, -) - -class Text2TextTopicNamer(TopicNamer): - """Name topics with a Text2Text model (e.g. Google's T5). - - Parameters - ---------- - model_name: str, default 'google/flan-t5-large' - Model to load from :hugs: Hub. - prompt_template: str - Prompt template to use when no negative terms are specified. - axis_prompt_template: str - Prompt template to use when negative terms are also specified. - device: str, default 'cpu' - Device to run the model on. - """ - - def __init__( - self, - model_name: str = "google/flan-t5-large", - prompt_template: str = DEFAULT_POSITIVE_PROMPT, - axis_prompt_template: str = DEFAULT_NEGATIVE_PROMPT, - device: str = "cpu", - ): - self.model_name = model_name - self.prompt_template = prompt_template - self.axis_prompt_template = axis_prompt_template - self.device = device - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to( - self.device - ) - - def name_topic( - self, - positive: list[list[str]], - negative: Optional[list[list[str]]] = None, - ) -> str: - if negative is not None: - prompt = self.axis_prompt_template.format( - positive=", ".join(positive), negative=", ".join(negative) - ) - else: - prompt = self.prompt_template.format(positive=", ".join(positive)) - inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device) - output = self.model.generate(**inputs, max_new_tokens=24) - label = self.tokenizer.decode(output[0], skip_special_tokens=True) - return label - - -class ChatTopicNamer(TopicNamer): - """Name topics with a Chat model, e.g. Zephyr-7b-beta +class LLMTopicNamer(TopicNamer): + """Name topics with an instruction-finetuned LLM, e.g. Zephyr-7b-beta Parameters ---------- - model_name: str, default 'HuggingFaceH4/zephyr-7b-beta' + model_name: str, default 'HuggingFaceTB/SmolLM2-1.7B-Instruct' Model to load from :hugs: Hub. prompt_template: str Prompt template to use when no negative terms are specified. - axis_prompt_template: str - Prompt template to use when negative terms are also specified. system_prompt: str System prompt to use for the language model. device: str, default 'cpu' @@ -77,15 +21,13 @@ class ChatTopicNamer(TopicNamer): def __init__( self, - model_name: str = "HuggingFaceH4/zephyr-7b-beta", - prompt_template: str = DEFAULT_POSITIVE_PROMPT, - axis_prompt_template: str = DEFAULT_NEGATIVE_PROMPT, + model_name: str = "HuggingFaceTB/SmolLM2-1.7B-Instruct", + prompt_template: str = DEFAULT_PROMPT, system_prompt: str = DEFAULT_SYSTEM_PROMPT, device: str = "cpu", ): self.model_name = model_name self.prompt_template = prompt_template - self.axis_prompt_template = axis_prompt_template self.system_prompt = system_prompt self.device = device self.pipe = pipeline( @@ -94,15 +36,9 @@ def __init__( def name_topic( self, - positive: list[list[str]], - negative: Optional[list[list[str]]] = None, + keywords: list[list[str]], ) -> str: - if negative is not None: - prompt = self.axis_prompt_template.format( - positive=", ".join(positive), negative=", ".join(negative) - ) - else: - prompt = self.prompt_template.format(positive=", ".join(positive)) + prompt = self.prompt_template.format(keywords=", ".join(keywords)) messages = [ {"role": "system", "content": self.system_prompt}, {"role": "user", "content": prompt},