Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add init code #9

Merged
merged 1 commit into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Global configs:
USE_AZURE=True
MAX_RETRY=10
RETRY_WAIT_SECONDS=20
DUMP_CHAT_CACHE=True
USE_CHAT_CACHE=True
DUMP_EMBEDDING_CACHE=True
USE_EMBEDDING_CACHE=True
LOG_LLM_CHAT_CONTENT=False
CHAT_FREQUENCY_PENALTY=0.0
CHAT_PRESENCE_PENALTY=0.0

# embedding model configs:
EMBEDDING_OPENAI_API_KEY=your_api_key
EMBEDDING_AZURE_API_BASE=your_api_base
EMBEDDING_AZURE_API_VERSION=your_api_version
EMBEDDING_MODEL=text-embedding-3-small


# chat model configs:
CHAT_OPENAI_API_KEY=your_api_key
CHAT_AZURE_API_BASE=your_api_base
CHAT_AZURE_API_VERSION=your_api_version
CHAT_MODEL=your_model_version
CHAT_MAX_TOKENS=3000
CHAT_TEMPERATURE=0.7
CHAT_STREAM=True
24 changes: 24 additions & 0 deletions rdagent/app/CI/ci.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "rdagent",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# %%
from document_process.document_reader import load_and_process_pdfs_by_langchain, classify_report_from_dict
from dotenv import load_dotenv
from oai.llm_utils import APIBackend
from pathlib import Path
import json


from document_process.document_analysis import extract_factors_from_report_dict_and_classify_result
from document_process.document_analysis import check_factor_dict_viability
from document_process.document_analysis import deduplicate_factors_several_times


def extract_factors_and_implement(report_file_path: str) -> None:
    """Extract factor definitions from a research-report PDF and dump factor books.

    Pipeline:
      1. Load the PDF(s) and classify which documents are factor reports.
      2. Extract candidate factors from the classified documents.
      3. Deduplicate them (longest-formulation rule, then LLM-based grouping)
         and filter by LLM-judged viability.
      4. Write four factor books (all / viable / LLM-deduplicated / both) to
         ``<name>.json`` and ``<name>.md`` in the current working directory.

    :param report_file_path: path to the research report PDF passed to the loader.
    :returns: ``None``; results are written to disk as a side effect.
    """
    # API keys etc. must be present in the environment for APIBackend to work.
    assert load_dotenv()
    api = APIBackend()
    docs_dict_select = load_and_process_pdfs_by_langchain(Path(report_file_path))

    selected_report_dict = classify_report_from_dict(report_dict=docs_dict_select, api=api, vote_time=1)
    file_to_factor_result = extract_factors_from_report_dict_and_classify_result(docs_dict_select, selected_report_dict)

    # Group every extracted occurrence of each factor name across all files.
    factor_dict = {}
    for file_name in file_to_factor_result:
        for factor_name in file_to_factor_result[file_name]:
            factor_dict.setdefault(factor_name, [])
            factor_dict[factor_name].append(file_to_factor_result[file_name][factor_name])

    # Simple deduplication: when a factor appears several times, keep the
    # occurrence with the longest formulation (assumed most informative).
    factor_dict_simple_deduplication = {}
    for factor_name in factor_dict:
        if len(factor_dict[factor_name]) > 1:
            factor_dict_simple_deduplication[factor_name] = max(
                factor_dict[factor_name], key=lambda x: len(x["formulation"])
            )
        else:
            factor_dict_simple_deduplication[factor_name] = factor_dict[factor_name][0]
    # %%

    factor_viability = check_factor_dict_viability(factor_dict_simple_deduplication)

    # %%

    # LLM-based deduplication: groups of names judged to denote the same
    # factor; handle larger groups first.
    duplication_names_list = deduplicate_factors_several_times(factor_dict_simple_deduplication)
    duplication_names_list = sorted(duplication_names_list, key=lambda x: len(x), reverse=True)
    with open("duplication_names_list.json", "w", encoding="utf-8") as f:
        json.dump(duplication_names_list, f, indent=4)

    # %%
    # Keep only factors judged implementable ("viable").
    factor_dict_viable = {
        factor_name: factor_dict_simple_deduplication[factor_name]
        for factor_name in factor_dict_simple_deduplication
        if factor_viability[factor_name]["viability"]
    }

    # Map every duplicate name to the canonical (first) name of its group.
    to_replace_dict = {}
    for duplication_names in duplication_names_list:
        for duplication_factor_name in duplication_names[1:]:
            to_replace_dict[duplication_factor_name] = duplication_names[0]

    # Drop duplicates (and case-insensitive name collisions) from the full book.
    added_lower_name_set = set()
    factor_dict_deduplication_with_llm = dict()
    for factor_name in factor_dict_simple_deduplication:
        if factor_name not in to_replace_dict and factor_name.lower() not in added_lower_name_set:
            added_lower_name_set.add(factor_name.lower())
            factor_dict_deduplication_with_llm[factor_name] = factor_dict_simple_deduplication[factor_name]

    # Same mapping, but each group's canonical name is its first *viable*
    # member; groups with no viable member are skipped entirely.
    to_replace_viable_dict = {}
    for duplication_names in duplication_names_list:
        viability_list = [factor_viability[name]["viability"] for name in duplication_names]
        if True not in viability_list:
            continue
        target_factor_name = duplication_names[viability_list.index(True)]
        for duplication_factor_name in duplication_names:
            if duplication_factor_name == target_factor_name:
                continue
            to_replace_viable_dict[duplication_factor_name] = target_factor_name

    added_lower_name_set = set()
    factor_dict_deduplication_with_llm_and_viable = dict()
    for factor_name in factor_dict_viable:
        if factor_name not in to_replace_viable_dict and factor_name.lower() not in added_lower_name_set:
            added_lower_name_set.add(factor_name.lower())
            factor_dict_deduplication_with_llm_and_viable[factor_name] = factor_dict_simple_deduplication[factor_name]

    # %%

    dump_md_list = [
        [factor_dict_simple_deduplication, "final_factor_book"],
        [factor_dict_viable, "final_viable_factor_book"],
        [factor_dict_deduplication_with_llm, "final_deduplicated_factor_book"],
        [factor_dict_deduplication_with_llm_and_viable, "final_deduplicated_viable_factor_book"],
    ]

    for target_dict, book_name in dump_md_list:
        # utf-8 is required: the markdown contains Chinese headings.
        with open(f"{book_name}.json", "w", encoding="utf-8") as fj:
            json.dump(target_dict, fj, indent=4)
        with open(f"{book_name}.md", "w", encoding="utf-8") as fw:
            for current_index, factor_name in enumerate(target_dict, start=1):
                formulation = target_dict[factor_name]["formulation"]
                # Escape underscores so factor/variable names render literally
                # inside the Markdown/LaTeX formulation.
                if factor_name in formulation:
                    target_factor_name = factor_name.replace("_", "\\_")
                    formulation = formulation.replace(factor_name, target_factor_name)
                for variable in target_dict[factor_name]["variables"]:
                    if variable in formulation:
                        target_variable = variable.replace("_", "\\_")
                        formulation = formulation.replace(variable, target_variable)

                fw.write(f"## {current_index}. 因子名称:{factor_name}\n")
                fw.write(f"### Viability: {target_dict[factor_name]['viability']}\n")
                fw.write(f"### Viability Reason: {target_dict[factor_name]['viability_reason']}\n")
                fw.write(f"### description: {target_dict[factor_name]['description']}\n")
                fw.write(f"### formulation: $$ {formulation} $$\n")
                fw.write(f"### formulation string: {formulation}\n")
                # Write a table of each variable and its description.
                fw.write("### variable tables: \n")
                fw.write("| variable | description |\n")
                fw.write("| -------- | ----------- |\n")
                for variable in target_dict[factor_name]["variables"]:
                    fw.write(f"| {variable} | {target_dict[factor_name]['variables'][variable]} |\n")
47 changes: 47 additions & 0 deletions rdagent/core/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# TODO: use pydantic for other modules in Qlib
# from pydantic_settings import BaseSettings
import os
from typing import Union

from dotenv import load_dotenv

# make sure that env variable is loaded while calling Config()
# override=True lets .env values win over already-set environment variables.
load_dotenv(verbose=True, override=True)

from pydantic_settings import BaseSettings


class FincoSettings(BaseSettings):
    """Application settings loaded from the environment after ``load_dotenv``.

    Field names correspond to upper-cased environment variable names
    (e.g. ``use_azure`` <- ``USE_AZURE``; see ``.env.example``).
    """

    # Use Azure OpenAI endpoints instead of the default OpenAI API.
    use_azure: bool = True
    # Retry policy for LLM API calls.
    max_retry: int = 10
    retry_wait_seconds: int = 1
    continuous_mode: bool = False
    # Chat/embedding response caching: dump_* writes to the cache,
    # use_* reads from it.
    dump_chat_cache: bool = False
    use_chat_cache: bool = False
    dump_embedding_cache: bool = False
    use_embedding_cache: bool = False
    # Cache locations default to the current working directory.
    prompt_cache_path: str = os.getcwd() + "/prompt_cache.db"
    session_cache_folder_location: str = os.getcwd() + "/session_cache_folder/"
    # Max number of past messages to include in a chat session context.
    max_past_message_include: int = 10

    # If True, log the content of LLM chat exchanges.
    log_llm_chat_content: bool = True

    # Chat configs
    chat_openai_api_key: str = ""
    chat_azure_api_base: str = ""
    chat_azure_api_version: str = ""
    chat_model: str = ""
    chat_max_tokens: int = 3000
    chat_temperature: float = 0.5
    chat_stream: bool = True
    # None means no fixed seed (non-deterministic sampling).
    chat_seed: Union[int, None] = None
    chat_frequency_penalty: float = 0.0
    chat_presence_penalty: float = 0.0

    default_system_prompt: str = "You are an AI assistant who helps to answer user's questions about finance."

    # Embedding configs
    embedding_openai_api_key: str = ""
    embedding_azure_api_base: str = ""
    embedding_azure_api_version: str = ""
    embedding_model: str = ""
Loading
Loading