#60 implemented whisper_timestamped model and llama 3.2 for action items
sethu committed Dec 28, 2024
1 parent 4c028fe commit 4808cba
Showing 16 changed files with 378 additions and 8 deletions.
5 changes: 5 additions & 0 deletions app.py
@@ -99,9 +99,14 @@ def summarize_using_openai(text):
#trasnslation = transcribe_with_whisper_ts("https://utfs.io/f/3c714bc6-f728-48b6-813c-a77a8d281a7e-gfje9d.mp3")
#trasnslation = transcribe_with_whisper_ts("https://utfs.io/f/d3c3c169-02b7-4b70-a3e2-8f62514f5433-gfje9d.mp3")
print(trasnslation["text"])
# Build the result as a dict and serialize it with json.dumps; concatenating
# the float start/end values into a string raises a TypeError and yields
# invalid JSON. Requires `import json` at the top of this file.
segs = []
for segment in trasnslation["segments"]:
    segs.append({"start": segment["start"], "end": segment["end"], "text": segment["text"]})
    print("{0} - {1} : {2}".format(segment["start"], segment["end"], segment["text"]))
out = summarize_using_llama(trasnslation["text"])
x = json.dumps({"text": trasnslation["text"], "segments": segs, "summary": out})
print(x)
print(out)
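For reference, the dict serialized above has roughly this shape (a sketch with illustrative values; start/end are seconds as floats in whisper_timestamped's output):

example = {
    "text": "full translated transcript ...",
    "segments": [
        {"start": 0.0, "end": 4.2, "text": "first segment ..."},
    ],
    "summary": "bullet-point highlights from llama 3.2 ...",
}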
112 changes: 112 additions & 0 deletions cli/app.py
@@ -0,0 +1,112 @@
from dotenv import load_dotenv
import whisper
import ollama
import logging
from logger import logger
import openai
import whisper_timestamped as whisper_ts
import json
import datetime

# Load environment variables
load_dotenv()

# Import configurations and functions from modules
from config import openai_api_key, model_id, model_path
from load_model import load_model
#from extract_entities import extract_entities

openai.api_key = openai_api_key
# Load whisper model
model = load_model(model_id, model_path, True)


# Run the full pipeline: translate the audio, then summarize
def process_all_steps(audio):
    #transcription = transcribe(audio)
    translation = translate_with_whisper(audio)
    #translation = translate_with_ollama(transcription)
    #summary = summarize_using_llama(translation)
    summary = summarize_using_openai(translation)
    #return [transcription, translation, summary]
    return [translation, summary]

# Transcribe the audio in its original language
def transcribe(audio):
    logger.info("Started transcription")
    result = model.transcribe(audio, fp16=False)
    transcription = result["text"]
    return transcription

def transcribe_with_whisper_ts(audio_file):
    audio = whisper_ts.load_audio(audio_file)
    logger.info("Started transcription through whisper_timestamped")
    # Decoding options as suggested in the whisper-timestamped documentation
    options = dict(beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0))
    translate_options = dict(task="translate", **options)
    print(datetime.datetime.now())
    result = whisper_ts.transcribe_timestamped(
        model, audio,
        condition_on_previous_text=False,
        vad=True,
        trust_whisper_timestamps=False,
        **translate_options,
    )
    print(datetime.datetime.now())
    #result = whisper_ts.transcribe(model, audio)
    return result


# Translate the audio file to English using the whisper model
def translate_with_whisper(audio):
    logger.info("Started translation through whisper")
    options = dict(beam_size=5, best_of=5)
    translate_options = dict(task="translate", **options)
    result = model.transcribe(audio, **translate_options)
    return result["text"]

# Translate the transcribed text to English using llama 3.2 via Ollama
def translate_with_ollama(text):
    logger.info("Started translation through llama")
    response = ollama.generate(model="llama3.2", prompt="Translate the following text to English:" + text + "\nTRANSLATION:\n")
    translation = response["response"]
    return translation

# Summarize the English translation using the llama 3.2 model via Ollama
def summarize_using_llama(text):
    response = ollama.generate(model="llama3.2", prompt="Provide highlights of the conversation in bullet points without pretext:" + text + "\n \n")
    summary = response["response"]
    return summary


# Summarize the English translation using OpenAI
def summarize_using_openai(text):
    logger.info("Started summarization")
    prompt = "Summarize the following text: " + text
    try:
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500
        )
        summary = response.choices[0].message.content
    except Exception as e:
        logger.error(e)
        summary = "Unable to extract summary"
    return summary

text="It's like a dialogue in a movie. They don't believe if you say you are going to win. They believe only if you say you have won. It's very difficult to get a name in India. If you win in sports, everyone will be able to say the name you have won. How is this situation for you? We have been training for 4 years. In 4 years, I have been to many national meet like this. But at that time, I have only won bronze, silver and gold. In this meet, I have won my first gold. For this, We worked very hard for a year and achieved this success. Superb! How did your journey start? Tell us about your family. I don't have a father in my family. I have only my mother. My mother is a farmer. I have two sisters. When I was in 8th or 9th grade, I ran a school sports relay. At that time, my school PD sir took me to the district division. I won medals in that. But I didn't win medals even at the state level. At that time, I was not doing any training. I went to Koko training after coming to college. I was in Koko training for 3 years. After that, I came to Athletics school. My coach's name is Manikandan Arumugam. I trained with her for 4 years and now I am fully involved in Athletics. Superb! Superb! They say one important thing. No matter what sport you play, if you get angry, you can't win. You were talking about your coach, Manikandan Arumugam, correct? You tell about him. He is also an Athlete Sir. He is working in Southern Railway. He has been medalist for 10 years in National level. He has kept his rank for 10 years."

#Marathi audio
#translation = transcribe_with_whisper_ts("https://utfs.io/f/9ed82ee5-4dd9-4eeb-8f77-9a1dfbf35bc2-gfje9d.mp3")
#Tamil audio
#translation = transcribe_with_whisper_ts("https://utfs.io/f/3c714bc6-f728-48b6-813c-a77a8d281a7e-gfje9d.mp3")
#translation = transcribe_with_whisper_ts("https://utfs.io/f/d3c3c169-02b7-4b70-a3e2-8f62514f5433-gfje9d.mp3")
#out = summarize_using_llama(translation["text"])
out = summarize_using_llama(text)
'''segs = []
segments = translation["segments"]
for segment in segments:
    seg = {"start": segment["start"], "end": segment["end"], "text": segment["text"]}
    segs.append(seg)
result = {"text": translation["text"], "segments": segs, "summary": out}
print(result)'''
print(out)
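For reference, a minimal sketch of exercising this file end to end on a local recording (sample.mp3 is a hypothetical path; any format ffmpeg can decode should work):

# Hypothetical driver, mirroring the commented-out calls above.
translation = transcribe_with_whisper_ts("sample.mp3")
for segment in translation["segments"]:
    print("{0} - {1} : {2}".format(segment["start"], segment["end"], segment["text"]))
print(summarize_using_llama(translation["text"]))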
9 changes: 9 additions & 0 deletions cli/config.py
@@ -0,0 +1,9 @@
import os
from dotenv import load_dotenv

load_dotenv()

openai_api_key = os.getenv("OPENAI_KEY")
#model_id = os.getenv('MODEL_ID', 'large-v3')
model_id = os.getenv('MODEL_ID')
model_path = os.getenv('MODEL_PATH')
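For reference, a minimal .env sketch matching the variables read above (values are placeholders, not taken from this repo):

OPENAI_KEY=sk-your-key-here
MODEL_ID=large-v3
MODEL_PATH=/workspace/whisper-model/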
25 changes: 25 additions & 0 deletions cli/download_model.py
@@ -0,0 +1,25 @@
import sys

# Check that two command-line arguments are provided
if len(sys.argv) != 3:
    print("Usage: python download_model.py <whisper_model_id> <whisper_model_output_path>")
    print("Example: python download_model.py large-v3 /workspace/whisper-model/")
    sys.exit(1)

# Check if the model path ends with '/'
model_path = sys.argv[2]
if not model_path.endswith('/'):
    model_path += '/'

### Download the model to a local directory - specify the version you want to use in the first parameter
import whisper
model_id = sys.argv[1]
model_path = f'{model_path}{model_id}'
# Available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large']

# The whisper module's load_model() method loads a whisper model in your Python application. You must pass the model name as a parameter to the load_model() method.
try:
    model = whisper.load_model(model_id, download_root=model_path)
    print("Model has successfully been downloaded")
except Exception as e:
    print(f"Error downloading the model: {e}")
9 changes: 9 additions & 0 deletions cli/download_whisper.py
@@ -0,0 +1,9 @@
import whisper
import os

model_path = "whisper_model"
model_id = 'large-v3'
os.makedirs(model_path, exist_ok=True)

# Download model
model = whisper.load_model(model_id, download_root=model_path)
84 changes: 84 additions & 0 deletions cli/extract_entities.py
@@ -0,0 +1,84 @@
import openai
from config import openai_api_key

openai.api_key = openai_api_key

def extract_entities(text):
    prompt = f"""
    The following entities are present in Indian languages.
    Please extract the following entities from the text:
    Name, pin code, phone number, gender, occupation, and address.
    Provide the summary of the text in exactly the format below:
    Name is ......., pin code is ........, phone number is ........, gender is ........, occupation is ........, Address is ............ .
    Text: "{text}"
    Summary:
    Detailed view:
    Original language: {text}
    """

    try:
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500
        )
        response_text = response.choices[0].message.content
    except Exception as e:
        return f"Error during OpenAI API call: {e}", "Detailed view not available."

    # Process the response to extract summary and detailed transcription
    if "Detailed view:" in response_text:
        parts = response_text.split("Detailed view:")
        summary_part = parts[0].strip()
        detailed_transcription_part = parts[1].strip()
    else:
        summary_part = response_text.strip()
        detailed_transcription_part = "Detailed view not provided."

    # Format the summary and detailed transcription
    formatted_summary = format_summary(summary_part)
    formatted_detailed_transcription = format_detailed_transcription(detailed_transcription_part)

    return formatted_summary, formatted_detailed_transcription

def format_summary(summary):
    # Process the summary to remove unnecessary parts
    lines = summary.split('\n')
    summary_lines = []
    is_summary_section = False

    for line in lines:
        line = line.strip()
        if line.startswith("Summary:"):
            is_summary_section = True
            continue
        if is_summary_section:
            summary_lines.append(line)

    formatted_summary = ' '.join(summary_lines)
    return formatted_summary

def format_detailed_transcription(detailed_transcription):
    # Process the detailed transcription to ensure proper formatting
    lines = detailed_transcription.split('\n')
    detailed_lines = [line.strip() for line in lines if line.strip()]
    formatted_detailed_transcription = '\n'.join(detailed_lines)
    return formatted_detailed_transcription
18 changes: 18 additions & 0 deletions cli/load_model.py
@@ -0,0 +1,18 @@
import torch
import whisper
import whisper_timestamped

# Load the whisper model, downloading it if it isn't stored locally
def load_model(model_id, model_path, is_ts):
    # Check whether a GPU is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    #device = "cpu"
    if is_ts:
        model = whisper_timestamped.load_model(model_id, device=device, download_root=model_path)
    else:
        model = whisper.load_model(model_id, device=device, download_root=model_path)
    print(
        f"Model will be run on {device}\n"
        f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    )
    return model
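For reference, a minimal sketch of calling this loader both ways (assumes the config module from cli/config.py):

from config import model_id, model_path
from load_model import load_model

ts_model = load_model(model_id, model_path, is_ts=True)      # whisper_timestamped variant
plain_model = load_model(model_id, model_path, is_ts=False)  # vanilla whisper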
16 changes: 16 additions & 0 deletions cli/logger.py
@@ -0,0 +1,16 @@
from os import path
import logging
import logging.config

#log_file_path = path.join(path.dirname(path.abspath(__file__)), 'log.config')
#logging.config.fileConfig(log_file_path)

# create logger
logger = logging.getLogger('simpleExample')

# 'application' code
logger.debug('debug message')
logger.info('info message')
logger.warning('warn message')
logger.error('error message')
logger.critical('critical message')
27 changes: 27 additions & 0 deletions cli/logging.connf
@@ -0,0 +1,27 @@
[loggers]
keys=root,simpleExample

[handlers]
keys=consoleHandler

[formatters]
keys=simpleFormatter

[logger_root]
level=DEBUG
handlers=consoleHandler

[logger_simpleExample]
level=DEBUG
handlers=consoleHandler
qualname=simpleExample
propagate=0

[handler_consoleHandler]
class=StreamHandler
level=DEBUG
formatter=simpleFormatter
args=(sys.stdout,)

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
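For reference, a minimal sketch of loading this config from cli/logger.py, in place of its commented-out fileConfig lines (assumes the config file sits next to the script; note the filename as committed is logging.connf):

from os import path
import logging
import logging.config

# Hypothetical wiring: point fileConfig at the config added in this commit.
config_path = path.join(path.dirname(path.abspath(__file__)), 'logging.connf')
logging.config.fileConfig(config_path)
logger = logging.getLogger('simpleExample')
logger.info('logging configured from file')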
43 changes: 42 additions & 1 deletion service/audio_service.py
@@ -1,9 +1,42 @@
from scipy import misc


# @misc{lintoai2023whispertimestamped,
#   title={whisper-timestamped},
#   author={Louradour, J{\'e}r{\^o}me},
#   journal={GitHub repository},
#   year={2023},
#   publisher={GitHub},
#   howpublished = {\url{https://github.com/linto-ai/whisper-timestamped}}
# }
# @article{radford2022robust,
# title={Robust speech recognition via large-scale weak supervision},
# author={Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
# journal={arXiv preprint arXiv:2212.04356},
# year={2022}
# }
# @article{JSSv031i07,
# title={Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package},
# author={Giorgino, Toni},
# journal={Journal of Statistical Software},
# year={2009},
# volume={31},
# number={7},
# doi={10.18637/jss.v031.i07}
# }
from fastapi import UploadFile
import openai
from dotenv import load_dotenv
from config import openai_api_key, model_id, model_path
from load_model import load_model
import logging
import whisper_timestamped as whisper_ts

logging.basicConfig(level=logging.INFO) # Set the logging level
logger = logging.getLogger(__name__)
@@ -13,7 +13,7 @@
openai.api_key = openai_api_key
# Load whisper model
logger.info("Loading model...")
model = load_model(model_id, model_path)
model = load_model(model_id, model_path=model_path, is_ts=True)

# Translate the audio file to English using the whisper model
def translate_with_whisper(audioPath):
@@ -23,3 +23,11 @@ def translate_with_whisper(audioPath):
    result = model.transcribe(audioPath, **translate_options)
    return result["text"]

# Translate the audio file to English using the whisper timestamped model
def translate_with_whisper_timestamped(audioPath):
    logger.info("translation started")
    options = dict(beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0))
    translate_options = dict(task="translate", **options)
    result = whisper_ts.transcribe_timestamped(
        model, audioPath,
        condition_on_previous_text=False,
        vad=True,
        trust_whisper_timestamps=False,
        **translate_options,
    )
    return result
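For reference, a minimal sketch of exposing this helper behind a FastAPI route (the endpoint below is hypothetical; the actual router lives elsewhere in this repo). UploadFile is already imported at the top of this file:

import shutil
import tempfile
from fastapi import FastAPI, UploadFile

app = FastAPI()

@app.post("/translate-timestamped")
async def translate_timestamped(file: UploadFile):
    # Whisper expects a file path, so persist the upload to a temp file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name
    return translate_with_whisper_timestamped(tmp_path)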
