#60 implemented whisper_timestamped model and llama 3.2 for action items
sethu committed Dec 28, 2024
1 parent 4c028fe commit 4808cba
Showing 16 changed files with 378 additions and 8 deletions.
5 changes: 5 additions & 0 deletions app.py
@@ -99,9 +99,14 @@ def summarize_using_openai(text):
#trasnslation = transcribe_with_whisper_ts("https://utfs.io/f/3c714bc6-f728-48b6-813c-a77a8d281a7e-gfje9d.mp3")
#trasnslation = transcribe_with_whisper_ts("https://utfs.io/f/d3c3c169-02b7-4b70-a3e2-8f62514f5433-gfje9d.mp3")
print(trasnslation["text"])
# Build the result as a dict and serialize it with json.dumps; concatenating
# the float start/end values into a string raises a TypeError and yields
# invalid JSON. Requires `import json` at the top of this file.
segs = []
for segment in trasnslation["segments"]:
    segs.append({"start": segment["start"], "end": segment["end"], "text": segment["text"]})
    print("{0} - {1} : {2}".format(segment["start"], segment["end"], segment["text"]))
out = summarize_using_llama(trasnslation["text"])
x = json.dumps({"text": trasnslation["text"], "segments": segs, "summary": out})
print(x)
print(out)
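For reference, the dict serialized above has roughly this shape (a sketch with illustrative values; start/end are seconds as floats in whisper_timestamped's output):

example = {
    "text": "full translated transcript ...",
    "segments": [
        {"start": 0.0, "end": 4.2, "text": "first segment ..."},
    ],
    "summary": "bullet-point highlights from llama 3.2 ...",
}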
112 changes: 112 additions & 0 deletions cli/app.py
@@ -0,0 +1,112 @@
from dotenv import load_dotenv
import whisper
import ollama
import logging
from logger import logger
import openai
import whisper_timestamped as whisper_ts
import json
import datetime

# Load environment variables
load_dotenv()

# Import configurations and functions from modules
from config import openai_api_key, model_id, model_path
from load_model import load_model
#from extract_entities import extract_entities

openai.api_key = openai_api_key
# Load whisper model
model = load_model(model_id, model_path, True)


# Run the full pipeline: translate the audio, then summarize
def process_all_steps(audio):
    #transcription = transcribe(audio)
    translation = translate_with_whisper(audio)
    #translation = translate_with_ollama(transcription)
    #summary = summarize_using_llama(translation)
    summary = summarize_using_openai(translation)
    #return [transcription, translation, summary]
    return [translation, summary]

# Transcribe the audio in its original language
def transcribe(audio):
    logger.info("Started transcription")
    result = model.transcribe(audio, fp16=False)
    transcription = result["text"]
    return transcription

def transcribe_with_whisper_ts(audio_file):
    audio = whisper_ts.load_audio(audio_file)
    logger.info("Started transcription through whisper_timestamped")
    # Decoding options as suggested in the whisper-timestamped documentation
    options = dict(beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0))
    translate_options = dict(task="translate", **options)
    print(datetime.datetime.now())
    result = whisper_ts.transcribe_timestamped(
        model, audio,
        condition_on_previous_text=False,
        vad=True,
        trust_whisper_timestamps=False,
        **translate_options,
    )
    print(datetime.datetime.now())
    #result = whisper_ts.transcribe(model, audio)
    return result


# Translate the audio file to English using the whisper model
def translate_with_whisper(audio):
    logger.info("Started translation through whisper")
    options = dict(beam_size=5, best_of=5)
    translate_options = dict(task="translate", **options)
    result = model.transcribe(audio, **translate_options)
    return result["text"]

# Translate the transcribed text to English using llama 3.2 via Ollama
def translate_with_ollama(text):
    logger.info("Started translation through llama")
    response = ollama.generate(model="llama3.2", prompt="Translate the following text to English:" + text + "\nTRANSLATION:\n")
    translation = response["response"]
    return translation

# Summarize the English translation using the llama 3.2 model via Ollama
def summarize_using_llama(text):
    response = ollama.generate(model="llama3.2", prompt="Provide highlights of the conversation in bullet points without pretext:" + text + "\n \n")
    summary = response["response"]
    return summary


# Summarize the English translation using OpenAI
def summarize_using_openai(text):
    logger.info("Started summarization")
    prompt = "Summarize the following text: " + text
    try:
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500
        )
        summary = response.choices[0].message.content
    except Exception as e:
        logger.error(e)
        summary = "Unable to extract summary"
    return summary

text="It's like a dialogue in a movie. They don't believe if you say you are going to win. They believe only if you say you have won. It's very difficult to get a name in India. If you win in sports, everyone will be able to say the name you have won. How is this situation for you? We have been training for 4 years. In 4 years, I have been to many national meet like this. But at that time, I have only won bronze, silver and gold. In this meet, I have won my first gold. For this, We worked very hard for a year and achieved this success. Superb! How did your journey start? Tell us about your family. I don't have a father in my family. I have only my mother. My mother is a farmer. I have two sisters. When I was in 8th or 9th grade, I ran a school sports relay. At that time, my school PD sir took me to the district division. I won medals in that. But I didn't win medals even at the state level. At that time, I was not doing any training. I went to Koko training after coming to college. I was in Koko training for 3 years. After that, I came to Athletics school. My coach's name is Manikandan Arumugam. I trained with her for 4 years and now I am fully involved in Athletics. Superb! Superb! They say one important thing. No matter what sport you play, if you get angry, you can't win. You were talking about your coach, Manikandan Arumugam, correct? You tell about him. He is also an Athlete Sir. He is working in Southern Railway. He has been medalist for 10 years in National level. He has kept his rank for 10 years."

#Marathi audio
#translation = transcribe_with_whisper_ts("https://utfs.io/f/9ed82ee5-4dd9-4eeb-8f77-9a1dfbf35bc2-gfje9d.mp3")
#Tamil audio
#translation = transcribe_with_whisper_ts("https://utfs.io/f/3c714bc6-f728-48b6-813c-a77a8d281a7e-gfje9d.mp3")
#translation = transcribe_with_whisper_ts("https://utfs.io/f/d3c3c169-02b7-4b70-a3e2-8f62514f5433-gfje9d.mp3")
#out = summarize_using_llama(translation["text"])
out = summarize_using_llama(text)
'''segs = []
segments = translation["segments"]
for segment in segments:
    seg = {"start": segment["start"], "end": segment["end"], "text": segment["text"]}
    segs.append(seg)
result = {"text": translation["text"], "segments": segs, "summary": out}
print(result)'''
print(out)
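For reference, a minimal sketch of exercising this file end to end on a local recording (sample.mp3 is a hypothetical path; any format ffmpeg can decode should work):

# Hypothetical driver, mirroring the commented-out calls above.
translation = transcribe_with_whisper_ts("sample.mp3")
for segment in translation["segments"]:
    print("{0} - {1} : {2}".format(segment["start"], segment["end"], segment["text"]))
print(summarize_using_llama(translation["text"]))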
9 changes: 9 additions & 0 deletions cli/config.py
@@ -0,0 +1,9 @@
import os
from dotenv import load_dotenv

load_dotenv()

openai_api_key = os.getenv("OPENAI_KEY")
#model_id = os.getenv('MODEL_ID', 'large-v3')
model_id = os.getenv('MODEL_ID')
model_path = os.getenv('MODEL_PATH')
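For reference, a minimal .env sketch matching the variables read above (values are placeholders, not taken from this repo):

OPENAI_KEY=sk-your-key-here
MODEL_ID=large-v3
MODEL_PATH=/workspace/whisper-model/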
25 changes: 25 additions & 0 deletions cli/download_model.py
@@ -0,0 +1,25 @@
import sys

# Check that two command-line arguments are provided
if len(sys.argv) != 3:
    print("Usage: python download_model.py <whisper_model_id> <whisper_model_output_path>")
    print("Example: python download_model.py large-v3 /workspace/whisper-model/")
    sys.exit(1)

# Check if the model path ends with '/'
model_path = sys.argv[2]
if not model_path.endswith('/'):
    model_path += '/'

### Download the model to a local directory - specify the version you want to use in the first parameter
import whisper
model_id = sys.argv[1]
model_path = f'{model_path}{model_id}'
# Available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large']

# The whisper module's load_model() method loads a whisper model in your Python application. You must pass the model name as a parameter to the load_model() method.
try:
    model = whisper.load_model(model_id, download_root=model_path)
    print("Model has successfully been downloaded")
except Exception as e:
    print(f"Error downloading the model: {e}")
9 changes: 9 additions & 0 deletions cli/download_whisper.py
@@ -0,0 +1,9 @@
import whisper
import os

model_path = "whisper_model"
model_id = 'large-v3'
os.makedirs(model_path, exist_ok=True)

# Download model
model = whisper.load_model(model_id, download_root=model_path)
84 changes: 84 additions & 0 deletions cli/extract_entities.py
@@ -0,0 +1,84 @@
import openai
from config import openai_api_key

openai.api_key = openai_api_key

def extract_entities(text):
    prompt = f"""
    The following entities are present in Indian languages.
    Please extract the following entities from the text:
    Name, pin code, phone number, gender, occupation, and address.
    Provide the summary of the text in exactly the format below:
    Name is ......., pin code is ........, phone number is ........, gender is ........, occupation is ........, Address is ............ .
    Text: "{text}"
    Summary:
    Detailed view:
    Original language: {text}
    """

    try:
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500
        )
        response_text = response.choices[0].message.content
    except Exception as e:
        return f"Error during OpenAI API call: {e}", "Detailed view not available."

    # Process the response to extract summary and detailed transcription
    if "Detailed view:" in response_text:
        parts = response_text.split("Detailed view:")
        summary_part = parts[0].strip()
        detailed_transcription_part = parts[1].strip()
    else:
        summary_part = response_text.strip()
        detailed_transcription_part = "Detailed view not provided."

    # Format the summary and detailed transcription
    formatted_summary = format_summary(summary_part)
    formatted_detailed_transcription = format_detailed_transcription(detailed_transcription_part)

    return formatted_summary, formatted_detailed_transcription

def format_summary(summary):
    # Process the summary to remove unnecessary parts
    lines = summary.split('\n')
    summary_lines = []
    is_summary_section = False

    for line in lines:
        line = line.strip()
        if line.startswith("Summary:"):
            is_summary_section = True
            continue
        if is_summary_section:
            summary_lines.append(line)

    formatted_summary = ' '.join(summary_lines)
    return formatted_summary

def format_detailed_transcription(detailed_transcription):
    # Process the detailed transcription to ensure proper formatting
    lines = detailed_transcription.split('\n')
    detailed_lines = [line.strip() for line in lines if line.strip()]
    formatted_detailed_transcription = '\n'.join(detailed_lines)
    return formatted_detailed_transcription
18 changes: 18 additions & 0 deletions cli/load_model.py
@@ -0,0 +1,18 @@
import torch
import whisper
import whisper_timestamped

# Load the whisper model, downloading it if it isn't stored locally
def load_model(model_id, model_path, is_ts):
    # Check whether a GPU is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    #device = "cpu"
    if is_ts:
        model = whisper_timestamped.load_model(model_id, device=device, download_root=model_path)
    else:
        model = whisper.load_model(model_id, device=device, download_root=model_path)
    print(
        f"Model will be run on {device}\n"
        f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    )
    return model
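For reference, a minimal sketch of calling this loader both ways (assumes the config module from cli/config.py):

from config import model_id, model_path
from load_model import load_model

ts_model = load_model(model_id, model_path, is_ts=True)      # whisper_timestamped variant
plain_model = load_model(model_id, model_path, is_ts=False)  # vanilla whisper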
16 changes: 16 additions & 0 deletions cli/logger.py
@@ -0,0 +1,16 @@
from os import path
import logging
import logging.config

#log_file_path = path.join(path.dirname(path.abspath(__file__)), 'log.config')
#logging.config.fileConfig(log_file_path)

# create logger
logger = logging.getLogger('simpleExample')

# 'application' code
logger.debug('debug message')
logger.info('info message')
logger.warning('warn message')
logger.error('error message')
logger.critical('critical message')
27 changes: 27 additions & 0 deletions cli/logging.connf
@@ -0,0 +1,27 @@
[loggers]
keys=root,simpleExample

[handlers]
keys=consoleHandler

[formatters]
keys=simpleFormatter

[logger_root]
level=DEBUG
handlers=consoleHandler

[logger_simpleExample]
level=DEBUG
handlers=consoleHandler
qualname=simpleExample
propagate=0

[handler_consoleHandler]
class=StreamHandler
level=DEBUG
formatter=simpleFormatter
args=(sys.stdout,)

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
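For reference, a minimal sketch of loading this config from cli/logger.py, in place of its commented-out fileConfig lines (assumes the config file sits next to the script; note the filename as committed is logging.connf):

from os import path
import logging
import logging.config

# Hypothetical wiring: point fileConfig at the config added in this commit.
config_path = path.join(path.dirname(path.abspath(__file__)), 'logging.connf')
logging.config.fileConfig(config_path)
logger = logging.getLogger('simpleExample')
logger.info('logging configured from file')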
43 changes: 42 additions & 1 deletion service/audio_service.py
@@ -1,9 +1,42 @@
from scipy import misc


# @misc{lintoai2023whispertimestamped,
#   title={whisper-timestamped},
#   author={Louradour, J{\'e}r{\^o}me},
#   journal={GitHub repository},
#   year={2023},
#   publisher={GitHub},
#   howpublished = {\url{https://github.com/linto-ai/whisper-timestamped}}
# }
# @article{radford2022robust,
# title={Robust speech recognition via large-scale weak supervision},
# author={Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
# journal={arXiv preprint arXiv:2212.04356},
# year={2022}
# }
# @article{JSSv031i07,
# title={Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package},
# author={Giorgino, Toni},
# journal={Journal of Statistical Software},
# year={2009},
# volume={31},
# number={7},
# doi={10.18637/jss.v031.i07}
# }
from fastapi import UploadFile
import openai
from dotenv import load_dotenv
from config import openai_api_key, model_id, model_path
from load_model import load_model
import logging
import whisper_timestamped as whisper_ts

logging.basicConfig(level=logging.INFO) # Set the logging level
logger = logging.getLogger(__name__)
@@ -13,7 +13,7 @@
openai.api_key = openai_api_key
# Load whisper model
logger.info("Loading model...")
model = load_model(model_id, model_path)
model = load_model(model_id, model_path=model_path, is_ts=True)

# Translate the audio file to English using the whisper model
def translate_with_whisper(audioPath):
@@ -23,3 +23,11 @@ def translate_with_whisper(audioPath):
    result = model.transcribe(audioPath, **translate_options)
    return result["text"]

# Translate the audio file to English using the whisper timestamped model
def translate_with_whisper_timestamped(audioPath):
    logger.info("translation started")
    options = dict(beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0))
    translate_options = dict(task="translate", **options)
    result = whisper_ts.transcribe_timestamped(
        model, audioPath,
        condition_on_previous_text=False,
        vad=True,
        trust_whisper_timestamps=False,
        **translate_options,
    )
    return result
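For reference, a minimal sketch of exposing this helper behind a FastAPI route (the endpoint below is hypothetical; the actual router lives elsewhere in this repo). UploadFile is already imported at the top of this file:

import shutil
import tempfile
from fastapi import FastAPI, UploadFile

app = FastAPI()

@app.post("/translate-timestamped")
async def translate_timestamped(file: UploadFile):
    # Whisper expects a file path, so persist the upload to a temp file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name
    return translate_with_whisper_timestamped(tmp_path)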
