diff --git a/app.py b/app.py
index 348becc..d400715 100644
--- a/app.py
+++ b/app.py
@@ -99,9 +99,14 @@ def summarize_using_openai(text):
 #trasnslation = transcribe_with_whisper_ts("https://utfs.io/f/3c714bc6-f728-48b6-813c-a77a8d281a7e-gfje9d.mp3")
 #trasnslation = transcribe_with_whisper_ts("https://utfs.io/f/d3c3c169-02b7-4b70-a3e2-8f62514f5433-gfje9d.mp3")
 print(trasnslation["text"])
+# collect the translation, per-segment timestamps and summary into a single dict
+x = {"text": trasnslation["text"], "segments": []}
 segments = trasnslation["segments"]
 for segment in segments:
+    x["segments"].append({"start": segment["start"], "end": segment["end"], "text": segment["text"]})
     txt = "{0} - {1} : {2}".format(segment["start"],segment["end"],segment["text"])
     print(txt)
 out = summarize_using_llama(trasnslation["text"])
+x["summary"] = out
+print(x)
 print(out)
diff --git a/cli/app.py b/cli/app.py
new file mode 100644
index 0000000..615b500
--- /dev/null
+++ b/cli/app.py
@@ -0,0 +1,112 @@
+from dotenv import load_dotenv
+import whisper
+import ollama
+import logging
+from logger import logger
+import openai
+import whisper_timestamped as whisper_ts
+import json
+import datetime
+
+# Load environment variables
+load_dotenv()
+
+# Import configurations and functions from modules
+from config import openai_api_key, model_id, model_path
+from load_model import load_model
+#from extract_entities import extract_entities
+
+openai.api_key = openai_api_key
+#Load whisper model
+model = load_model(model_id, model_path, True)
+
+
+#transcribe the audio in its original language
+def process_all_steps(audio):
+    #transcription =transcribe(audio)
+    translation = translate_with_whisper(audio)
+    #translation = translate_with_ollama(transcription)
+    #summary = summarize_using_llama(translation)
+    summary = summarize_using_openai(translation)
+    #return [transcription, translation, summary]
+    return [translation, summary]
+
+def transcribe(audio):
+    logger.info("Started transcription")
+    result = model.transcribe(audio,fp16=False)
+    transcription = result["text"]
+    return transcription
+
+def transcribe_with_whisper_ts(audio_file):
+    audio = whisper_ts.load_audio(audio_file)
+    logger.info("Started transcription through whisper")
+    #as suggested in the documentation
+    options = dict(beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0))
+    translate_options = dict(task="translate", **options)
+    print(datetime.datetime.now())
+    result = whisper_ts.transcribe_timestamped(model,audio,condition_on_previous_text=False,vad=True,trust_whisper_timestamps=False,**translate_options)
+    print(datetime.datetime.now())
+    #result = whisper_ts.transcribe(model, audio)
+    return result
+
+
+#translate the audio file to English using the whisper model
+def translate_with_whisper(audio):
+    logger.info("Started translation through whisper")
+    options = dict(beam_size=5, best_of=5)
+    translate_options = dict(task="translate", **options)
+    result = model.transcribe(audio,**translate_options)
+    return result["text"]
+
+#translate the transcribed text to English
+def translate_with_ollama(text):
+    logger.info("Started translation through llama")
+    response = ollama.generate(model= "llama3.2", prompt = "Translate the following text to English:"+text+"\n SUMMARY:\n")
+    translation = response["response"]
+    return translation
+
+#Using Ollama and the llama3.2 model, summarize the English translation
+def summarize_using_llama(text):
+    response = ollama.generate(model= "llama3.2", prompt = "Provide highlights of the conversation in bullet points without pretext:"+text+"\n \n")
+    summary = response["response"]
+    return summary
+
+
+#Using OpenAI, summarize the English translation
+def summarize_using_openai(text):
+    logger.info("Started summarization")
+    prompt = "Summarize the following text: " +text
+    try:
+        response = openai.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
+                {"role": "user", "content": prompt}
+            ],
+            max_tokens=500
+        )
+        summary = response.choices[0].message.content
+    except Exception as e:
+        logger.error(e)
+        summary = "Unable to extract summary"
+    return summary
+
+text="It's like a dialogue in a movie. They don't believe if you say you are going to win. They believe only if you say you have won. It's very difficult to get a name in India. If you win in sports, everyone will be able to say the name you have won. How is this situation for you? We have been training for 4 years. In 4 years, I have been to many national meet like this. But at that time, I have only won bronze, silver and gold. In this meet, I have won my first gold. For this, We worked very hard for a year and achieved this success. Superb! How did your journey start? Tell us about your family. I don't have a father in my family. I have only my mother. My mother is a farmer. I have two sisters. When I was in 8th or 9th grade, I ran a school sports relay. At that time, my school PD sir took me to the district division. I won medals in that. But I didn't win medals even at the state level. At that time, I was not doing any training. I went to Koko training after coming to college. I was in Koko training for 3 years. After that, I came to Athletics school. My coach's name is Manikandan Arumugam. I trained with her for 4 years and now I am fully involved in Athletics. Superb! Superb! They say one important thing. No matter what sport you play, if you get angry, you can't win. You were talking about your coach, Manikandan Arumugam, correct? You tell about him. He is also an Athlete Sir. He is working in Southern Railway. He has been medalist for 10 years in National level. He has kept his rank for 10 years."
+
+#Marathi audio
+#trasnslation = transcribe_with_whisper_ts("https://utfs.io/f/9ed82ee5-4dd9-4eeb-8f77-9a1dfbf35bc2-gfje9d.mp3")
+#Tamil audio
+#trasnslation = transcribe_with_whisper_ts("https://utfs.io/f/3c714bc6-f728-48b6-813c-a77a8d281a7e-gfje9d.mp3")
+#trasnslation = transcribe_with_whisper_ts("https://utfs.io/f/d3c3c169-02b7-4b70-a3e2-8f62514f5433-gfje9d.mp3")
+#out = summarize_using_llama(trasnslation["text"])
+out = summarize_using_llama(text)
+'''segs = []
+seg = {}
+segments = trasnslation["segments"]
+for segment in segments:
+    seg = {"start":segment["start"],"end":segment["end"],"text":segment["text"]}
+
+    segs.append(seg)
+result = {"text":trasnslation["text"], "segments": segs, "summary":out}
+print(result)'''
+print(out)
diff --git a/cli/config.py b/cli/config.py
new file mode 100644
index 0000000..96b92a9
--- /dev/null
+++ b/cli/config.py
@@ -0,0 +1,9 @@
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+openai_api_key = os.getenv("OPENAI_KEY")
+#model_id = os.getenv('MODEL_ID', 'large-v3')
+model_id = os.getenv('MODEL_ID')
+model_path = os.getenv('MODEL_PATH')
diff --git a/cli/download_model.py b/cli/download_model.py
new file mode 100644
index 0000000..2398a50
--- /dev/null
+++ b/cli/download_model.py
@@ -0,0 +1,25 @@
+import sys
+
+# Check if two command-line arguments are provided
+if len(sys.argv) !=3:
+    print("Usage: python download_model.py <model_id> <model_path>")
+    print("Example: python download_model.py large-v3 /workspace/whisper-model/")
+    sys.exit(1)
+
+# Check if the model path ends with '/'
+model_path = sys.argv[2]
+if not model_path.endswith('/'):
+    model_path += '/'
+
+### Download the model to a local directory - specify the version you want to use in the first parameter
+import whisper
+model_id = sys.argv[1]
+model_path = f'{model_path}{model_id}'
+# Available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large']
+
+# The whisper module's load_model() method loads a whisper model in your Python application. You must pass the model name as a parameter to the load_model() method.
+try:
+    model = whisper.load_model(model_id, download_root=model_path)
+    print("Model has successfully been downloaded")
+except Exception as e:
+    print(f"Error downloading the model: {e}")
diff --git a/cli/download_whisper.py b/cli/download_whisper.py
new file mode 100644
index 0000000..82e1c65
--- /dev/null
+++ b/cli/download_whisper.py
@@ -0,0 +1,9 @@
+import whisper
+import os
+
+model_path = "whisper_model"
+model_id = 'large-v3'
+os.makedirs(model_path, exist_ok=True)
+
+# Download model
+model = whisper.load_model(model_id, download_root=model_path)
diff --git a/cli/extract_entities.py b/cli/extract_entities.py
new file mode 100644
index 0000000..43dc6d6
--- /dev/null
+++ b/cli/extract_entities.py
@@ -0,0 +1,76 @@
+import openai
+from config import openai_api_key
+
+openai.api_key = openai_api_key
+
+def extract_entities(text):
+    prompt = f"""
+    The following entities are present in Indian Languages.
+    Please extract the following entities from the text:
+    Name, pin code, phone number, gender, occupation, and address.
+
+    Provide the summary of the text in exactly the format below:
+    Name is ......., pin code is ........, phone number is ........, gender is ........, occupation is ........, Address is ............ .
+
+    Text: "{text}"
+
+    Summary:
+
+
+    Detailed view:
+
+    Original language: {text}
+
+    """
+
+    try:
+        response = openai.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
+                {"role": "user", "content": prompt}
+            ],
+            max_tokens=500
+        )
+        response_text = response.choices[0].message.content
+    except Exception as e:
+        return f"Error during OpenAI API call: {e}", "Detailed view not available."
+
+    # Process the response to extract summary and detailed transcription
+    if "Detailed view:" in response_text:
+        parts = response_text.split("Detailed view:")
+        summary_part = parts[0].strip()
+        detailed_transcription_part = parts[1].strip()
+    else:
+        summary_part = response_text.strip()
+        detailed_transcription_part = "Detailed view not provided."
+
+    # Format the summary and detailed transcription
+    formatted_summary = format_summary(summary_part)
+    formatted_detailed_transcription = format_detailed_transcription(detailed_transcription_part)
+
+    return formatted_summary, formatted_detailed_transcription
+
+def format_summary(summary):
+    # Process the summary to remove unnecessary parts
+    lines = summary.split('\n')
+    summary_lines = []
+    is_summary_section = False
+
+    for line in lines:
+        line = line.strip()
+        if line.startswith("Summary:"):
+            is_summary_section = True
+            continue
+        if is_summary_section:
+            summary_lines.append(line)
+
+    formatted_summary = ' '.join(summary_lines)
+    return formatted_summary
+
+def format_detailed_transcription(detailed_transcription):
+    # Process the detailed transcription to ensure proper formatting
+    lines = detailed_transcription.split('\n')
+    detailed_lines = [line.strip() for line in lines if line.strip()]
+    formatted_detailed_transcription = '\n'.join(detailed_lines)
+    return formatted_detailed_transcription
diff --git a/cli/load_model.py b/cli/load_model.py
new file mode 100644
index 0000000..435b746
--- /dev/null
+++ b/cli/load_model.py
@@ -0,0 +1,18 @@
+import torch
+import whisper
+import whisper_timestamped
+
+#load the whisper model from the net if it isn't stored locally
+def load_model(model_id, model_path, is_ts):
+    #check whether a GPU is available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    #device = "cpu"
+    if (is_ts):
+        model = whisper_timestamped.load_model(model_id, device=device, download_root=model_path)
+    else:
+        model = whisper.load_model(model_id, device=device, download_root=model_path)
+    print(
+        f"Model will be run on {device}\n"
+        f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
+    )
+    return model
diff --git a/cli/logger.py b/cli/logger.py
new file mode 100644
index 0000000..b323b73
--- /dev/null
+++ b/cli/logger.py
@@ -0,0 +1,16 @@
+from os import path
+import logging
+import logging.config
+
+#log_file_path = path.join(path.dirname(path.abspath(__file__)), 'log.config')
+#logging.config.fileConfig(log_file_path)
+
+# create logger
+logger = logging.getLogger('simpleExample')
+
+# 'application' code
+logger.debug('debug message')
+logger.info('info message')
+logger.warning('warn message')
+logger.error('error message')
+logger.critical('critical message')
diff --git a/cli/logging.connf b/cli/logging.connf
new file mode 100644
index 0000000..63c8dcf
--- /dev/null
+++ b/cli/logging.connf
@@ -0,0 +1,27 @@
+[loggers]
+keys=root,simpleExample
+
+[handlers]
+keys=consoleHandler
+
+[formatters]
+keys=simpleFormatter
+
+[logger_root]
+level=DEBUG
+handlers=consoleHandler
+
+[logger_simpleExample]
+level=DEBUG
+handlers=consoleHandler
+qualname=simpleExample
+propagate=0
+
+[handler_consoleHandler]
+class=StreamHandler
+level=DEBUG
+formatter=simpleFormatter
+args=(sys.stdout,)
+
+[formatter_simpleFormatter]
+format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
diff --git a/service/audio_service.py b/service/audio_service.py
index c3d6df0..4f9debb 100644
--- a/service/audio_service.py
+++ b/service/audio_service.py
@@ -1,9 +1,33 @@
+# References for the timestamped transcription approach:
+# @misc{lintoai2023whispertimestamped,
+#   title={whisper-timestamped},
+#   author={Louradour, J{\'e}r{\^o}me},
+#   journal={GitHub repository},
+#   year={2023},
+#   publisher={GitHub},
+#   howpublished = {\url{https://github.com/linto-ai/whisper-timestamped}}
+# }
+# @article{radford2022robust,
+#   title={Robust speech recognition via large-scale weak supervision},
+#   author={Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
+#   journal={arXiv preprint arXiv:2212.04356},
+#   year={2022}
+# }
+# @article{JSSv031i07,
+#   title={Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package},
+#   author={Giorgino, Toni},
+#   journal={Journal of Statistical Software},
+#   year={2009},
+#   volume={31},
+#   number={7},
+#   doi={10.18637/jss.v031.i07}
+# }
 from fastapi import UploadFile
 import openai
 from dotenv import load_dotenv
 from config import openai_api_key, model_id, model_path
 from load_model import load_model
 import logging
+import whisper_timestamped as whisper_ts
 
 logging.basicConfig(level=logging.INFO)  # Set the logging level
 logger = logging.getLogger(__name__)
@@ -13,7 +37,7 @@ openai.api_key = openai_api_key
 
 #Load whisher model
 logger.info("Loading model...")
-model = load_model(model_id, model_path)
+model = load_model(model_id, model_path=model_path, is_ts=True)
 
 #translate the audio file to English language using whisper model
 def translate_with_whisper(audioPath):
@@ -23,3 +47,11 @@
     result = model.transcribe(audioPath,**translate_options)
     return result["text"]
 
+#translate the audio file to English language using the whisper timestamped model
+def translate_with_whisper_timestamped(audioPath):
+    logger.info("translation started")
+    options = dict(beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0))
+    translate_options = dict(task="translate", **options)
+    result = whisper_ts.transcribe_timestamped(model,audioPath,condition_on_previous_text=False,vad=True,trust_whisper_timestamps=False,**translate_options)
+    return result
+
diff --git a/service/load_model.py b/service/load_model.py
index b538e98..79854ce 100644
--- a/service/load_model.py
+++ b/service/load_model.py
@@ -2,11 +2,11 @@
 import whisper
 
 #load the whisper model from net if it isn't stored locally
-def load_model(model_id, model_path):
+def load_model(model_id, model_path, is_ts):
     #check GPU is avaialbe
     device = "cuda" if torch.cuda.is_available() else "cpu"
     #device = "cpu"
-    model = whisper.load_model(model_id, device=device, download_root=model_path)
+    model = whisper.load_model(model_id, device=device, download_root=model_path,)
     print(
         f"Model will be run on {device}\n"
         f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
diff --git a/service/main.py b/service/main.py
index bee66c1..125a433 100644
--- a/service/main.py
+++ b/service/main.py
@@ -3,8 +3,10 @@ from logger import logger
 from dotenv import load_dotenv
 from starlette.middleware.cors import CORSMiddleware
 
-from audio_service import translate_with_whisper
-from summarizer import summarize_using_openai
+#from audio_service import translate_with_whisper
+from audio_service import translate_with_whisper_timestamped
+#from summarizer import summarize_using_openai
+from summarizer import summarize_using_llama
 from pydantic import BaseModel
 
 app = FastAPI()
@@ -36,11 +38,11 @@ async def upload_audio(body: Body):
         logger.error("invalid file type")
         return JSONResponse(status_code=400, content={"message":"Invalid file type"})
     #translation = translate_with_whisper(transcription)
-    translation = translate_with_whisper(body.audio_file_link)
+    translation = translate_with_whisper_timestamped(body.audio_file_link)
     logger.info("translation done")
 
     #summary = summarize_using_openai(translation)
-    summary = summarize_using_openai(translation)
+    summary = summarize_using_llama(translation["text"])
     logger.info("summary done")
 
 
diff --git a/service/service/conversation_diarization/temp_outputs/b93b9139-e902-4f5f-8810-f0df64e76256/mono_file.wav b/service/service/conversation_diarization/temp_outputs/b93b9139-e902-4f5f-8810-f0df64e76256/mono_file.wav
new file mode 100644
index 0000000..0f6940d
Binary files /dev/null and b/service/service/conversation_diarization/temp_outputs/b93b9139-e902-4f5f-8810-f0df64e76256/mono_file.wav differ
diff --git a/service/summarizer.py b/service/summarizer.py
index f3c4085..ab7c9eb 100644
--- a/service/summarizer.py
+++ b/service/summarizer.py
@@ -3,6 +3,7 @@ from langchain.schema.runnable.base import RunnableSequence
 from template_config import get_summarization_template
 import logging
+import ollama
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -37,4 +38,9 @@ def summarize_using_openai(text):
     except Exception as e:
         logger.error(f"Error occurred during summarization: {str(e)}")
         return "An error occurred while summarizing the text."
-
+
+#Using Ollama and the llama3.2 model, summarize the English translation
+def summarize_using_llama(text):
+    response = ollama.generate(model= "llama3.2", prompt = "Provide highlights of the conversation in bullet points:"+text+"\n \n")
+    summary = response["response"]
+    return summary
diff --git a/service/util.py b/service/util.py
new file mode 100644
index 0000000..9b67477
--- /dev/null
+++ b/service/util.py
@@ -0,0 +1,13 @@
+import json
+
+def generate_timestamp_json(translation,summary):
+    segs = []
+    seg = {}
+    segments = translation["segments"]
+    for segment in segments:
+        seg = {"start":segment["start"],"end":segment["end"],"text":segment["text"]}
+        segs.append(seg)
+
+    result = {"message": "File processed successfully!","translation":translation["text"], "segments": segs, "summary":summary}
+    return result
+
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..8defebc
--- /dev/null
+++ b/test.py
@@ -0,0 +1,3 @@
+json = {"start": 59.22,"end": 59.22,"txt": "something went right"}
+txt = "{0} - {1} : {2}".format(json["start"], json["end"], json["txt"])
+print(txt)