Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add init code #9

Merged
merged 1 commit into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Global configs:
USE_AZURE=True
MAX_RETRY=10
RETRY_WAIT_SECONDS=20
DUMP_CHAT_CACHE=True
USE_CHAT_CACHE=True
DUMP_EMBEDDING_CACHE=True
USE_EMBEDDING_CACHE=True
LOG_LLM_CHAT_CONTENT=False
CHAT_FREQUENCY_PENALTY=0.0
CHAT_PRESENCE_PENALTY=0.0

# embedding model configs:
EMBEDDING_OPENAI_API_KEY=your_api_key
EMBEDDING_AZURE_API_BASE=your_api_base
EMBEDDING_AZURE_API_VERSION=your_api_version
EMBEDDING_MODEL=text-embedding-3-small


# chat model configs:
CHAT_OPENAI_API_KEY=your_api_key
CHAT_AZURE_API_BASE=your_api_base
CHAT_AZURE_API_VERSION=your_api_version
CHAT_MODEL=your_model_version
CHAT_MAX_TOKENS=3000
CHAT_TEMPERATURE=0.7
CHAT_STREAM=True
24 changes: 24 additions & 0 deletions rdagent/app/CI/ci.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "rdagent",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# %%
from document_process.document_reader import load_and_process_pdfs_by_langchain, classify_report_from_dict
from dotenv import load_dotenv
from oai.llm_utils import APIBackend
from pathlib import Path
import json


from document_process.document_analysis import extract_factors_from_report_dict_and_classify_result
from document_process.document_analysis import check_factor_dict_viability
from document_process.document_analysis import deduplicate_factors_several_times


def extract_factors_and_implement(report_file_path: str) -> None:
    """Extract factor definitions from a research-report PDF and dump factor books.

    Pipeline:
      1. Load the PDF(s) and classify which documents are factor reports.
      2. Extract candidate factors from the classified documents.
      3. Deduplicate them (longest-formulation rule, then LLM-based grouping)
         and filter by LLM-judged viability.
      4. Write four factor books (all / viable / LLM-deduplicated / both) to
         ``<name>.json`` and ``<name>.md`` in the current working directory.

    :param report_file_path: path to the research report PDF passed to the loader.
    :returns: ``None``; results are written to disk as a side effect.
    """
    # API keys etc. must be present in the environment for APIBackend to work.
    assert load_dotenv()
    api = APIBackend()
    docs_dict_select = load_and_process_pdfs_by_langchain(Path(report_file_path))

    selected_report_dict = classify_report_from_dict(report_dict=docs_dict_select, api=api, vote_time=1)
    file_to_factor_result = extract_factors_from_report_dict_and_classify_result(docs_dict_select, selected_report_dict)

    # Group every extracted occurrence of each factor name across all files.
    factor_dict = {}
    for file_name in file_to_factor_result:
        for factor_name in file_to_factor_result[file_name]:
            factor_dict.setdefault(factor_name, [])
            factor_dict[factor_name].append(file_to_factor_result[file_name][factor_name])

    # Simple deduplication: when a factor appears several times, keep the
    # occurrence with the longest formulation (assumed most informative).
    factor_dict_simple_deduplication = {}
    for factor_name in factor_dict:
        if len(factor_dict[factor_name]) > 1:
            factor_dict_simple_deduplication[factor_name] = max(
                factor_dict[factor_name], key=lambda x: len(x["formulation"])
            )
        else:
            factor_dict_simple_deduplication[factor_name] = factor_dict[factor_name][0]
    # %%

    factor_viability = check_factor_dict_viability(factor_dict_simple_deduplication)

    # %%

    # LLM-based deduplication: groups of names judged to denote the same
    # factor; handle larger groups first.
    duplication_names_list = deduplicate_factors_several_times(factor_dict_simple_deduplication)
    duplication_names_list = sorted(duplication_names_list, key=lambda x: len(x), reverse=True)
    with open("duplication_names_list.json", "w", encoding="utf-8") as f:
        json.dump(duplication_names_list, f, indent=4)

    # %%
    # Keep only factors judged implementable ("viable").
    factor_dict_viable = {
        factor_name: factor_dict_simple_deduplication[factor_name]
        for factor_name in factor_dict_simple_deduplication
        if factor_viability[factor_name]["viability"]
    }

    # Map every duplicate name to the canonical (first) name of its group.
    to_replace_dict = {}
    for duplication_names in duplication_names_list:
        for duplication_factor_name in duplication_names[1:]:
            to_replace_dict[duplication_factor_name] = duplication_names[0]

    # Drop duplicates (and case-insensitive name collisions) from the full book.
    added_lower_name_set = set()
    factor_dict_deduplication_with_llm = dict()
    for factor_name in factor_dict_simple_deduplication:
        if factor_name not in to_replace_dict and factor_name.lower() not in added_lower_name_set:
            added_lower_name_set.add(factor_name.lower())
            factor_dict_deduplication_with_llm[factor_name] = factor_dict_simple_deduplication[factor_name]

    # Same mapping, but each group's canonical name is its first *viable*
    # member; groups with no viable member are skipped entirely.
    to_replace_viable_dict = {}
    for duplication_names in duplication_names_list:
        viability_list = [factor_viability[name]["viability"] for name in duplication_names]
        if True not in viability_list:
            continue
        target_factor_name = duplication_names[viability_list.index(True)]
        for duplication_factor_name in duplication_names:
            if duplication_factor_name == target_factor_name:
                continue
            to_replace_viable_dict[duplication_factor_name] = target_factor_name

    added_lower_name_set = set()
    factor_dict_deduplication_with_llm_and_viable = dict()
    for factor_name in factor_dict_viable:
        if factor_name not in to_replace_viable_dict and factor_name.lower() not in added_lower_name_set:
            added_lower_name_set.add(factor_name.lower())
            factor_dict_deduplication_with_llm_and_viable[factor_name] = factor_dict_simple_deduplication[factor_name]

    # %%

    dump_md_list = [
        [factor_dict_simple_deduplication, "final_factor_book"],
        [factor_dict_viable, "final_viable_factor_book"],
        [factor_dict_deduplication_with_llm, "final_deduplicated_factor_book"],
        [factor_dict_deduplication_with_llm_and_viable, "final_deduplicated_viable_factor_book"],
    ]

    for target_dict, book_name in dump_md_list:
        # utf-8 is required: the markdown contains Chinese headings.
        with open(f"{book_name}.json", "w", encoding="utf-8") as fj:
            json.dump(target_dict, fj, indent=4)
        with open(f"{book_name}.md", "w", encoding="utf-8") as fw:
            for current_index, factor_name in enumerate(target_dict, start=1):
                formulation = target_dict[factor_name]["formulation"]
                # Escape underscores so factor/variable names render literally
                # inside the Markdown/LaTeX formulation.
                if factor_name in formulation:
                    target_factor_name = factor_name.replace("_", "\\_")
                    formulation = formulation.replace(factor_name, target_factor_name)
                for variable in target_dict[factor_name]["variables"]:
                    if variable in formulation:
                        target_variable = variable.replace("_", "\\_")
                        formulation = formulation.replace(variable, target_variable)

                fw.write(f"## {current_index}. 因子名称:{factor_name}\n")
                fw.write(f"### Viability: {target_dict[factor_name]['viability']}\n")
                fw.write(f"### Viability Reason: {target_dict[factor_name]['viability_reason']}\n")
                fw.write(f"### description: {target_dict[factor_name]['description']}\n")
                fw.write(f"### formulation: $$ {formulation} $$\n")
                fw.write(f"### formulation string: {formulation}\n")
                # Write a table of each variable and its description.
                fw.write("### variable tables: \n")
                fw.write("| variable | description |\n")
                fw.write("| -------- | ----------- |\n")
                for variable in target_dict[factor_name]["variables"]:
                    fw.write(f"| {variable} | {target_dict[factor_name]['variables'][variable]} |\n")
47 changes: 47 additions & 0 deletions rdagent/core/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# TODO: use pydantic for other modules in Qlib
# from pydantic_settings import BaseSettings
import os
from typing import Union

from dotenv import load_dotenv

# make sure that env variable is loaded while calling Config()
# override=True lets .env values win over already-set environment variables.
load_dotenv(verbose=True, override=True)

from pydantic_settings import BaseSettings


class FincoSettings(BaseSettings):
    """Application settings loaded from the environment after ``load_dotenv``.

    Field names correspond to upper-cased environment variable names
    (e.g. ``use_azure`` <- ``USE_AZURE``; see ``.env.example``).
    """

    # Use Azure OpenAI endpoints instead of the default OpenAI API.
    use_azure: bool = True
    # Retry policy for LLM API calls.
    max_retry: int = 10
    retry_wait_seconds: int = 1
    continuous_mode: bool = False
    # Chat/embedding response caching: dump_* writes to the cache,
    # use_* reads from it.
    dump_chat_cache: bool = False
    use_chat_cache: bool = False
    dump_embedding_cache: bool = False
    use_embedding_cache: bool = False
    # Cache locations default to the current working directory.
    prompt_cache_path: str = os.getcwd() + "/prompt_cache.db"
    session_cache_folder_location: str = os.getcwd() + "/session_cache_folder/"
    # Max number of past messages to include in a chat session context.
    max_past_message_include: int = 10

    # If True, log the content of LLM chat exchanges.
    log_llm_chat_content: bool = True

    # Chat configs
    chat_openai_api_key: str = ""
    chat_azure_api_base: str = ""
    chat_azure_api_version: str = ""
    chat_model: str = ""
    chat_max_tokens: int = 3000
    chat_temperature: float = 0.5
    chat_stream: bool = True
    # None means no fixed seed (non-deterministic sampling).
    chat_seed: Union[int, None] = None
    chat_frequency_penalty: float = 0.0
    chat_presence_penalty: float = 0.0

    default_system_prompt: str = "You are an AI assistant who helps to answer user's questions about finance."

    # Embedding configs
    embedding_openai_api_key: str = ""
    embedding_azure_api_base: str = ""
    embedding_azure_api_version: str = ""
    embedding_model: str = ""
Loading
Loading