main.py
import os
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain import HuggingFacePipeline, PromptTemplate
import transformers
from langchain.chains import ConversationalRetrievalChain
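# Clone the repository to index. TARGET_REPOSITORY defaults to a sample project
# and can be overridden via the environment with any public Git URL.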
folder_name = "sample_code"
target_repository = os.getenv("TARGET_REPOSITORY", "https://github.com/V-Gutierrez/genesisapp-be")
os.system(f"git clone {target_repository} {folder_name}")
documents = []
print("\033[1;32mReading files...\033[0m")
# Walk the cloned tree and wrap each readable file in a Document,
# recording its path as the source for later attribution.
for root, dirs, files in os.walk(folder_name):
    for file in files:
        path = os.path.join(root, file)
        try:
            with open(path, "r", encoding="utf-8") as o:
                code = o.read()
            documents.append(Document(page_content=code, metadata={"source": path}))
        except UnicodeDecodeError:
            # Some files (binaries, images) are not UTF-8 encoded; ignore them for now.
            pass
print(f"Generated Documents: {len(documents)}")
print("\033[1;34mCreating vector store...\033[0m")
huggingface_embeddings = HuggingFaceEmbeddings(model_name="krlvi/sentence-t5-base-nlpl-code-x-glue")
persist_directory = "db"
db = Chroma.from_documents(documents, huggingface_embeddings, persist_directory=persist_directory)
db.persist()
retriever = db.as_retriever()
print("\033[1;35mCreating pipeline...\033[0m")
model_id = "stabilityai/stable-code-3b"
config = transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=True)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, config=config, trust_remote_code=True)
pipe = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
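# Reuse the EOS token for padding to silence the "pad_token_id is not set" warning.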
pipe.model.config.pad_token_id = pipe.model.config.eos_token_id
llm = HuggingFacePipeline(pipeline=pipe)
condense_question_template = """
All questions are about the codebase embedded in the documents.
Given the chat history and a follow-up question, rephrase the follow-up as a standalone question.
Chat History: {chat_history}
Follow Up question: {question}
Standalone question:
"""
condense_question_prompt = PromptTemplate.from_template(condense_question_template)
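# ConversationalRetrievalChain first condenses the question with the prompt above,
# then retrieves matching documents from Chroma and passes them to the LLM.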
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    condense_question_prompt=condense_question_prompt,
)
while True:
    user_question = input("\033[1;36mAsk a question about the codebase (or say `quit`): \n\033[0m")
    if user_question.lower() == 'quit':
        os.system('cls' if os.name == 'nt' else 'clear')
        print("Goodbye!")
        break
    print("\033[1;33mAnswering...\033[0m")
    # chat_history is left empty, so each question is answered independently.
    result = qa_chain({"question": user_question, "chat_history": []})
    print("\033[1;36mAnswer:", result['answer'], "\033[0m")
    print(f"Sources: {[x.metadata['source'] for x in result['source_documents']]}")