diff --git a/pyproject.toml b/pyproject.toml index da4392e3..f497268d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,7 +98,7 @@ select = ["ALL"] "test/*" = ["S101"] [tool.setuptools] -py-modules = ["rdagent"] +packages = ["rdagent"] [tool.setuptools.dynamic] dependencies = {file = ["requirements.txt"]} diff --git a/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py b/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py index b11b9185..8277c36e 100644 --- a/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py +++ b/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py @@ -2,143 +2,29 @@ import json from pathlib import Path -from document_process.document_analysis import ( - check_factor_dict_viability, +from rdagent.document_process.document_analysis import ( + filter_factor_by_viability, deduplicate_factors_several_times, - extract_factors_from_report_dict_and_classify_result, -) -from document_process.document_reader import ( - classify_report_from_dict, - load_and_process_pdfs_by_langchain, + extract_factors_from_report_dict, + merge_file_to_factor_dict_to_factor_dict, ) +from rdagent.document_process.document_reader import load_and_process_pdfs_by_langchain +from rdagent.document_process.document_analysis import classify_report_from_dict from dotenv import load_dotenv -from oai.llm_utils import APIBackend def extract_factors_and_implement(report_file_path: str): assert load_dotenv() - api = APIBackend() - docs_dict_select = load_and_process_pdfs_by_langchain(Path(report_file_path)) - - selected_report_dict = classify_report_from_dict(report_dict=docs_dict_select, api=api, vote_time=1) - file_to_factor_result = extract_factors_from_report_dict_and_classify_result(docs_dict_select, selected_report_dict) - - factor_dict = {} - for file_name in file_to_factor_result: - for factor_name in file_to_factor_result[file_name]: - factor_dict.setdefault(factor_name, []) - factor_dict[factor_name].append(file_to_factor_result[file_name][factor_name]) - - factor_dict_simple_deduplication = {} - for factor_name in factor_dict: - if len(factor_dict[factor_name]) > 1: - factor_dict_simple_deduplication[factor_name] = max( - factor_dict[factor_name], - key=lambda x: len(x["formulation"]), - ) - else: - factor_dict_simple_deduplication[factor_name] = factor_dict[factor_name][0] - # %% - - factor_viability = check_factor_dict_viability(factor_dict_simple_deduplication) - # json.dump( - # factor_viability, - # open( - # "factor_viability_all_reports.json", - # "w", - # ), - # indent=4, - # ) - - # factor_viability = json.load( - # open( - # "factor_viability_all_reports.json" - # ) - # ) - - # %% - - duplication_names_list = deduplicate_factors_several_times(factor_dict_simple_deduplication) - duplication_names_list = sorted(duplication_names_list, key=lambda x: len(x), reverse=True) - json.dump(duplication_names_list, open("duplication_names_list.json", "w"), indent=4) - - # %% - factor_dict_viable = { - factor_name: factor_dict_simple_deduplication[factor_name] - for factor_name in factor_dict_simple_deduplication - if factor_viability[factor_name]["viability"] - } - - to_replace_dict = {} - for duplication_names in duplication_names_list: - for duplication_factor_name in duplication_names[1:]: - to_replace_dict[duplication_factor_name] = duplication_names[0] - - added_lower_name_set = set() - factor_dict_deduplication_with_llm = dict() - for factor_name in 
factor_dict_simple_deduplication: - if factor_name not in to_replace_dict and factor_name.lower() not in added_lower_name_set: - added_lower_name_set.add(factor_name.lower()) - factor_dict_deduplication_with_llm[factor_name] = factor_dict_simple_deduplication[factor_name] - - to_replace_viable_dict = {} - for duplication_names in duplication_names_list: - viability_list = [factor_viability[name]["viability"] for name in duplication_names] - if True not in viability_list: - continue - target_factor_name = duplication_names[viability_list.index(True)] - for duplication_factor_name in duplication_names: - if duplication_factor_name == target_factor_name: - continue - to_replace_viable_dict[duplication_factor_name] = target_factor_name - - added_lower_name_set = set() - factor_dict_deduplication_with_llm_and_viable = dict() - for factor_name in factor_dict_viable: - if factor_name not in to_replace_viable_dict and factor_name.lower() not in added_lower_name_set: - added_lower_name_set.add(factor_name.lower()) - factor_dict_deduplication_with_llm_and_viable[factor_name] = factor_dict_simple_deduplication[factor_name] - - # %% + docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path)) - dump_md_list = [ - [factor_dict_simple_deduplication, "final_factor_book"], - [factor_dict_viable, "final_viable_factor_book"], - [factor_dict_deduplication_with_llm, "final_deduplicated_factor_book"], - [factor_dict_deduplication_with_llm_and_viable, "final_deduplicated_viable_factor_book"], - ] + selected_report_dict = classify_report_from_dict(report_dict=docs_dict, vote_time=1) + file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict) + factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result) - for dump_md in dump_md_list: - factor_name_set = set() - current_index = 1 - target_dict = dump_md[0] - json.dump(target_dict, open(f"{dump_md[1]}.json", "w"), indent=4) - with open( - rf"{dump_md[1]}.md", - "w", - ) as fw: - for factor_name in target_dict: - formulation = target_dict[factor_name]["formulation"] - if factor_name in formulation: - target_factor_name = factor_name.replace("_", r"\_") - formulation = formulation.replace(factor_name, target_factor_name) - for variable in target_dict[factor_name]["variables"]: - if variable in formulation: - target_variable = variable.replace("_", r"\_") - formulation = formulation.replace(variable, target_variable) + factor_dict_viable, factor_viability = filter_factor_by_viability(factor_dict) - fw.write(f"## {current_index}. 
Factor Name: {factor_name}\n")
-                fw.write(f"### Viability: {target_dict[factor_name]['viability']}\n")
-                fw.write(f"### Viability Reason: {target_dict[factor_name]['viability_reason']}\n")
-                fw.write(f"### description: {target_dict[factor_name]['description']}\n")
-                fw.write(f"### formulation: $$ {formulation} $$\n")
-                fw.write(f"### formulation string: {formulation}\n")
-                # write a table of variable and its description
+    factor_dict, duplication_names_list = deduplicate_factors_several_times(factor_dict, factor_viability)
-                fw.write("### variable tables: \n")
-                fw.write("| variable | description |\n")
-                fw.write("| -------- | ----------- |\n")
-                for variable in target_dict[factor_name]["variables"]:
-                    fw.write(f"| {variable} | {target_dict[factor_name]['variables'][variable]} |\n")
-                current_index += 1
+if __name__ == "__main__":
+    extract_factors_and_implement("/home/xuyang1/workspace/report.pdf")
diff --git a/rdagent/core/conf.py b/rdagent/core/conf.py
index 596b9d4b..12b526b2 100644
--- a/rdagent/core/conf.py
+++ b/rdagent/core/conf.py
@@ -13,6 +13,7 @@ class FincoSettings(BaseSettings):
     use_azure: bool = True
+    use_azure_token_provider: bool = False
     max_retry: int = 10
     retry_wait_seconds: int = 1
     continuous_mode: bool = False
diff --git a/rdagent/factor_implementation/share_modules/prompt.py b/rdagent/core/prompts.py
similarity index 52%
rename from rdagent/factor_implementation/share_modules/prompt.py
rename to rdagent/core/prompts.py
index 33cea23b..e138d953 100644
--- a/rdagent/factor_implementation/share_modules/prompt.py
+++ b/rdagent/core/prompts.py
@@ -2,21 +2,21 @@ from typing import Dict
 import yaml
-from finco.utils import SingletonBaseClass
+from rdagent.core.utils import SingletonBaseClass


-class FactorImplementationPrompts(Dict, SingletonBaseClass):
-    def __init__(self):
-        super().__init__()
-        prompt_yaml_path = Path(__file__).parent / "prompts.yaml"
-
+class Prompts(Dict, SingletonBaseClass):
+    def __init__(self, file_path: Path):
         prompt_yaml_dict = yaml.load(
             open(
-                prompt_yaml_path,
+                file_path,
                 encoding="utf8",
             ),
             Loader=yaml.FullLoader,
         )
+        if prompt_yaml_dict is None:
+            raise ValueError(f"Failed to load prompts from {file_path}")
+
         for key, value in prompt_yaml_dict.items():
             self[key] = value
diff --git a/rdagent/core/utils.py b/rdagent/core/utils.py
index 85628358..f0e058f6 100644
--- a/rdagent/core/utils.py
+++ b/rdagent/core/utils.py
@@ -14,17 +14,21 @@ from fuzzywuzzy import fuzz


-class FincoException(Exception):
+class RDAgentException(Exception):
     pass


 class SingletonMeta(type):
-    _instance = None
+    _instance_dict = {}

     def __call__(cls, *args, **kwargs):
-        if cls._instance is None:
-            cls._instance = super(SingletonMeta, cls).__call__(*args, **kwargs)
-        return cls._instance
+        # Calls that mix positional and keyword arguments are hard to map onto one instance key,
+        # so singletons must be constructed with keyword arguments only.
+        if len(args) > 0:
+            raise RDAgentException("Singleton classes must be instantiated with keyword arguments only.")
+        kwargs_hash = hash(tuple(sorted(kwargs.items())))
+        if kwargs_hash not in cls._instance_dict:
+            cls._instance_dict[kwargs_hash] = super(SingletonMeta, cls).__call__(*args, **kwargs)
+        return cls._instance_dict[kwargs_hash]


 class SingletonBaseClass(metaclass=SingletonMeta):
diff --git a/rdagent/document_process/__init__.py b/rdagent/document_process/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/rdagent/document_process/document_analysis.py b/rdagent/document_process/document_analysis.py
index bb74edc1..97e85683 100644
---
a/rdagent/document_process/document_analysis.py +++ b/rdagent/document_process/document_analysis.py @@ -12,10 +12,11 @@ import yaml from azure.ai.formrecognizer import DocumentAnalysisClient from azure.core.credentials import AzureKeyCredential -from core.conf import FincoSettings as Config -from core.log import FinCoLog +from rdagent.core.conf import FincoSettings as Config +from rdagent.core.log import FinCoLog +from rdagent.core.prompts import Prompts from jinja2 import Template -from oai.llm_utils import APIBackend, create_embedding_with_multiprocessing +from rdagent.oai.llm_utils import APIBackend, create_embedding_with_multiprocessing from sklearn.cluster import KMeans from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import normalize @@ -25,10 +26,7 @@ from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader -with (Path(__file__).parent / "util_prompt.yaml").open(encoding="utf8") as f: - UTIL_PROMPT = yaml.safe_load( - f, - ) +document_process_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") def load_documents_by_langchain(path: Path) -> list: @@ -121,7 +119,6 @@ def load_and_process_pdfs_by_azure_document_intelligence(path: Path) -> dict[str def classify_report_from_dict( report_dict: Mapping[str, str], - api: APIBackend, input_max_token: int = 128000, vote_time: int = 1, substrings: tuple[str] = (), @@ -131,7 +128,6 @@ def classify_report_from_dict( - report_dict (Dict[str, str]): A dictionary where the key is the path of the report (ending with .pdf), and the value is either the report content as a string. - - api (APIBackend): An instance of the APIBackend class. - input_max_token (int): Specifying the maximum number of input tokens. - vote_time (int): An integer specifying how many times to vote. - substrings (list(str)): List of hardcode substrings. 
@@ -154,7 +150,7 @@ def classify_report_from_dict( ) res_dict = {} - classify_prompt = UTIL_PROMPT["classify_system"] + classify_prompt = document_process_prompts["classify_system"] enc = tiktoken.encoding_for_model("gpt-4-turbo") for key, value in report_dict.items(): @@ -182,7 +178,7 @@ def classify_report_from_dict( for _ in range(vote_time): user_prompt = content system_prompt = classify_prompt - res = api.build_messages_and_create_chat_completion( + res = APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True, @@ -208,7 +204,7 @@ def __extract_factors_name_and_desc_from_content( content: str, ) -> dict[str, dict[str, str]]: session = APIBackend().build_chat_session( - session_system_prompt=UTIL_PROMPT["extract_factors_system"], + session_system_prompt=document_process_prompts["extract_factors_system"], ) extracted_factor_dict = {} @@ -234,7 +230,7 @@ def __extract_factors_name_and_desc_from_content( break for factor_name, factor_description in factors.items(): extracted_factor_dict[factor_name] = factor_description - current_user_prompt = UTIL_PROMPT["extract_factors_follow_user"] + current_user_prompt = document_process_prompts["extract_factors_follow_user"] return extracted_factor_dict @@ -248,9 +244,9 @@ def __extract_factors_formulation_from_content( columns=["factor_name", "factor_description"], ) - system_prompt = UTIL_PROMPT["extract_factor_formulation_system"] + system_prompt = document_process_prompts["extract_factor_formulation_system"] current_user_prompt = Template( - UTIL_PROMPT["extract_factor_formulation_user"], + document_process_prompts["extract_factor_formulation_user"], ).render(report_content=content, factor_dict=factor_dict_df.to_string()) session = APIBackend().build_chat_session(session_system_prompt=system_prompt) @@ -288,7 +284,7 @@ def __extract_factors_formulation_from_content( return factor_to_formulation -def extract_factor_and_formulation_from_one_report( +def __extract_factor_and_formulation_from_one_report( content: str, ) -> dict[str, dict[str, str]]: final_factor_dict_to_one_report = {} @@ -299,6 +295,9 @@ def extract_factor_and_formulation_from_one_report( factor_dict, ) for factor_name in factor_dict: + if factor_name not in factor_to_formulation: + continue + final_factor_dict_to_one_report.setdefault(factor_name, {}) final_factor_dict_to_one_report[factor_name]["description"] = factor_dict[factor_name] @@ -318,7 +317,7 @@ def extract_factor_and_formulation_from_one_report( return final_factor_dict_to_one_report -def extract_factors_from_report_dict_and_classify_result( +def extract_factors_from_report_dict( report_dict: dict[str, str], useful_no_dict: dict[str, dict[str, str]], n_proc: int = 11, @@ -334,9 +333,7 @@ def extract_factors_from_report_dict_and_classify_result( final_report_factor_dict = {} # for file_name, content in useful_report_dict.items(): # final_report_factor_dict.setdefault(file_name, {}) - # final_report_factor_dict[ - # file_name - # ] = extract_factor_and_formulation_from_one_report(content) + # final_report_factor_dict[file_name] = __extract_factor_and_formulation_from_one_report(content) while len(final_report_factor_dict) != len(useful_report_dict): pool = mp.Pool(n_proc) @@ -348,7 +345,7 @@ def extract_factors_from_report_dict_and_classify_result( file_names.append(file_name) pool_result_list.append( pool.apply_async( - extract_factor_and_formulation_from_one_report, + __extract_factor_and_formulation_from_one_report, (content,), ), ) @@ -366,11 
+363,32 @@ def extract_factors_from_report_dict_and_classify_result(
     return final_report_factor_dict


-def check_factor_dict_viability_simulate_json_mode(
+def merge_file_to_factor_dict_to_factor_dict(
+    file_to_factor_dict: dict[str, dict],
+) -> dict:
+    factor_dict = {}
+    for file_name in file_to_factor_dict:
+        for factor_name in file_to_factor_dict[file_name]:
+            factor_dict.setdefault(factor_name, [])
+            factor_dict[factor_name].append(file_to_factor_dict[file_name][factor_name])
+
+    factor_dict_simple_deduplication = {}
+    for factor_name in factor_dict:
+        if len(factor_dict[factor_name]) > 1:
+            factor_dict_simple_deduplication[factor_name] = max(
+                factor_dict[factor_name],
+                key=lambda x: len(x["formulation"]),
+            )
+        else:
+            factor_dict_simple_deduplication[factor_name] = factor_dict[factor_name][0]
+    return factor_dict_simple_deduplication
+
+
+def __check_factor_dict_viability_simulate_json_mode(
     factor_df_string: str,
 ) -> dict[str, dict[str, str]]:
     session = APIBackend().build_chat_session(
-        session_system_prompt=UTIL_PROMPT["factor_viability_system"],
+        session_system_prompt=document_process_prompts["factor_viability_system"],
     )
     current_user_prompt = factor_df_string
@@ -393,9 +411,9 @@ def check_factor_dict_viability_simulate_json_mode(
     return {}


-def check_factor_dict_viability(
+def filter_factor_by_viability(
     factor_dict: dict[str, dict[str, str]],
-) -> dict[str, dict[str, str]]:
+) -> tuple[dict[str, dict[str, str]], dict[str, dict[str, str]]]:
     factor_viability_dict = {}
     factor_df = pd.DataFrame(factor_dict).T
@@ -410,7 +428,7 @@ def check_factor_dict_viability(
         result_list.append(
             pool.apply_async(
-                check_factor_dict_viability_simulate_json_mode,
+                __check_factor_dict_viability_simulate_json_mode,
                 (target_factor_df_string,),
             ),
         )
@@ -425,14 +443,20 @@ def check_factor_dict_viability(
     factor_df = factor_df[~factor_df.index.isin(factor_viability_dict)]

-    return factor_viability_dict
+    filtered_factor_dict = {
+        factor_name: factor_dict[factor_name]
+        for factor_name in factor_dict
+        if factor_viability_dict[factor_name]["viability"]
+    }
+
+    return filtered_factor_dict, factor_viability_dict


 def check_factor_duplication_simulate_json_mode(
     factor_df: pd.DataFrame,
 ) -> list[list[str]]:
     session = APIBackend().build_chat_session(
-        session_system_prompt=UTIL_PROMPT["factor_duplicate_system"],
+        session_system_prompt=document_process_prompts["factor_duplicate_system"],
     )
     current_user_prompt = factor_df.to_string()
@@ -588,6 +612,7 @@ def deduplicate_factor_dict(factor_dict: dict[str, dict[str, str]]) -> list[list

 def deduplicate_factors_several_times(
     factor_dict: dict[str, dict[str, str]],
+    factor_viability_dict: dict[str, dict[str, str]] = None,
-) -> list[list[str]]:
+) -> tuple[dict[str, dict[str, str]], list[list[str]]]:
     final_duplication_names_list = []
     current_round_factor_dict = factor_dict
@@ -604,5 +629,31 @@ def deduplicate_factors_several_times(
         if len(new_round_names) != 0:
             current_round_factor_dict = {factor_name: factor_dict[factor_name] for factor_name in new_round_names}
         else:
-            return final_duplication_names_list
-    return []
+            break
+
+    final_duplication_names_list = sorted(final_duplication_names_list, key=lambda x: len(x), reverse=True)
+
+    to_replace_dict = {}
+    for duplication_names in final_duplication_names_list:
+        if factor_viability_dict is not None:
+            viability_list = [factor_viability_dict[name]["viability"] for name in duplication_names]
+            if True not in viability_list:
+                continue
+            target_factor_name = duplication_names[viability_list.index(True)]
+        else:
+            target_factor_name = duplication_names[0]
+        for duplication_factor_name in duplication_names:
+            if duplication_factor_name == target_factor_name:
+                continue
+            to_replace_dict[duplication_factor_name] = target_factor_name
+
+    llm_deduplicated_factor_dict = dict()
+    added_lower_name_set = set()
+    for factor_name in factor_dict:
+        if factor_name not in to_replace_dict and factor_name.lower() not in added_lower_name_set:
+            if factor_viability_dict is not None and not factor_viability_dict[factor_name]["viability"]:
+                continue
+            added_lower_name_set.add(factor_name.lower())
+            llm_deduplicated_factor_dict[factor_name] = factor_dict[factor_name]
+
+    return llm_deduplicated_factor_dict, final_duplication_names_list
diff --git a/rdagent/document_process/document_reader.py b/rdagent/document_process/document_reader.py
index 6a93eb1c..2e8ad630 100644
--- a/rdagent/document_process/document_reader.py
+++ b/rdagent/document_process/document_reader.py
@@ -5,18 +5,12 @@
 import yaml
 from azure.ai.formrecognizer import DocumentAnalysisClient
 from azure.core.credentials import AzureKeyCredential
-from finco.conf import FincoSettings as Config
-
-if TYPE_CHECKING:
-    from langchain_core.documents import Document
+from rdagent.core.conf import FincoSettings as Config
+from rdagent.core.prompts import Prompts
+from langchain_core.documents import Document

 from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader

-with (Path(__file__).parent / "util_prompt.yaml").open(encoding="utf8") as f:
-    UTIL_PROMPT = yaml.safe_load(
-        f,
-    )
-

 def load_documents_by_langchain(path: Path) -> list:
     """Load documents from the specified path.
diff --git a/rdagent/document_process/prompts.yaml b/rdagent/document_process/prompts.yaml
new file mode 100644
index 00000000..f08f0048
--- /dev/null
+++ b/rdagent/document_process/prompts.yaml
@@ -0,0 +1,182 @@
+extract_factors_system: |-
+  The user will provide a financial engineering research report covering quantitative factor and model research. Please extract the following information as required:
+  1. Summarize the main research ideas of this report;
+  2. Extract all the factors and summarize how each factor is computed. Note that some factors may appear in tables, so do not miss any. Factor names must be in English, must not contain spaces, and may join words with underscores. The report may contain no factors; if so, return an empty dict;
+  3. Extract all the models in the report and summarize how each model is computed; you may describe the model construction or computation step by step. The report may contain no models; if so, return an empty dict.
+
+  The user will treat your factor names as keys to store the factors, so do not put any interactive message in the content. Respond with the output only, without any interaction or explanation.
+  All names should be in English.
+  Respond with your analysis in JSON format. The JSON schema should include:
+  ```json
+  {
+    "summary": "The summary of this report",
+    "factors": {
+      "Name of factor 1": "Description of factor 1",
+      "Name of factor 2": "Description of factor 2"
+    },
+    "models": {
+      "Name of model 1": "Description of model 1",
+      "Name of model 2": "Description of model 2"
+    }
+  }
+  ```
+
+extract_factors_follow_user: |-
+  Please continue extracting factors, ignoring those that appeared in earlier messages. If no factor is found, please return an empty dict.
+  Notice: You should not miss any factor in the report! Some factors may appear several times in the report; you may repeat them rather than risk missing others.
+  Respond with your analysis in JSON format. The JSON schema should include:
+  ```json
+  {
+    "factors": {
+      "Name of factor 1": "Description of factor 1",
+      "Name of factor 2": "Description of factor 2"
+    }
+  }
+  ```
+
+extract_factor_formulation_system: |-
+  The user will provide a financial engineering research report together with the list of factors extracted from it. Combining the report with the factor names and descriptions the user provides, please extract, as required:
+  1. The calculation formula of each factor, in LaTeX format. Variable names in a formula must not contain spaces and may join words with underscores; the factor names in the formulas must match the factor names provided by the user;
+  2. Explanations of the variables and functions in each formula, written in English, with variable and function names aligned to the names used in the formula.
+
+  The user has several data sources:
+  1. The Stock Trade Data Table, containing information about stock trades such as daily open, close, high, low, and vwap prices, volume, and turnover;
+  2. The Financial Data Table, containing company financial statements such as the balance sheet, income statement, and cash flow statement;
+  3. The Stock Fundamental Data Table, containing basic information about stocks, such as total shares outstanding, free float shares, industry classification, market classification, etc.;
+  4. The high-frequency data, containing the open, close, high, low, volume, and vwap of each stock in each minute.
+  Please try to expand each formulation so that it uses the data sources provided by the user.
+
+  The user will treat your factor names as keys to store the factors, so do not put any interactive message in the content. Respond with the output only, without any interaction or explanation.
+  You may extract only part of the user's input factors if the token budget is not enough. To avoid responding in an invalid format, do not extract more than thirty factors in one response.
+  Be careful with the "\" in your formulations: in JSON, certain characters like the backslash need to be escaped with another backslash. In particular, _ and \_ are different in LaTeX, so use \_ to represent _ in LaTeX.
+  Respond with your analysis in JSON format. The JSON schema should include:
+  ```json
+  {
+    "name of factor 1": {
+      "formulation": "latex formulation of factor 1",
+      "variables": {
+        "Name of variable or function 1": "Description of variable or function 1",
+        "Name of variable or function 2": "Description of variable or function 2"
+      }
+    },
+    "name of factor 2": {
+      "formulation": "latex formulation of factor 2",
+      "variables": {
+        "Name of variable or function 1": "Description of variable or function 1",
+        "Name of variable or function 2": "Description of variable or function 2"
+      }
+    }
+  }
+  ```
+
+extract_factor_formulation_user: |-
+  ===========================Report content:=============================
+  {{ report_content }}
+  ===========================Factor list in dataframe=============================
+  {{ factor_dict }}
+
+classify_system: |-
+  You are a research-report classification assistant. The user will input a financial research report. Please answer as required.
+  A factor is a variable that helps explain asset returns or prices; a model is a machine learning or deep learning model that uses variables such as factors to predict changes in prices or returns.
+
+  Please classify the report against two conditions:
+  1. It is a quantitative finance report about stock selection (to be strictly distinguished from market timing, fund selection, and the like);
+  2. It covers the construction of factors or models, or tests their performance.
+  If the report satisfies both conditions, output 1; otherwise, output 0.
+
+  Please answer in JSON. The JSON key is: class
+
+factor_viability_system: |-
+  The user has designed several factors for quant investment. Please help the user check the viability of these factors.
+  These factors are used to build a daily-frequency strategy in the China A-share market.
+
+  The user will provide a pandas-dataframe-like table containing the following information:
+  1. The name of the factor;
+  2. A simple description of the factor;
+  3. The formulation of the factor in LaTeX format;
+  4. The description of the variables and functions in the formulation of the factor.
+
+  The user has several data sources:
+  1. The Stock Trade Data Table, containing information about stock trades such as daily open, close, high, low, and vwap prices, volume, and turnover;
+  2. The Financial Data Table, containing company financial statements such as the balance sheet, income statement, and cash flow statement;
+  3. The Stock Fundamental Data Table, containing basic information about stocks, such as total shares outstanding, free float shares, industry classification, market classification, etc.;
+  4. The high-frequency data, containing the open, close, high, low, volume, and vwap of each stock in each minute;
+  5. The Consensus Expectations Factor, containing analysts' consensus expectations about the future performance of each company.
+
+
+  A viable factor should satisfy the following conditions:
+  1. The factor can be calculated at daily frequency;
+  2. The factor can be calculated for each individual stock;
+  3. The factor can be calculated from the data sources provided by the user.
+
+  You should give a decision for each factor the user provides, and you should reject a factor only on very solid grounds.
+  Please return true for viable factors and false for non-viable factors.
+
+  Notice: you may return only part of the factors due to the token limit. Your factor names should be the same as the user's factor names.
+
+  Please respond with your decisions in JSON format. Respond with the output JSON string only, without any interaction or explanation.
+  The JSON schema should include:
+  ```json
+  {
+    "Name of factor 1":
+      {
+        "viability": true,
+        "reason": "The reason for the viability of this factor"
+      },
+    "Name of factor 2":
+      {
+        "viability": false,
+        "reason": "The reason for the non-viability of this factor"
+      },
+    "Name of factor 3":
+      {
+        "viability": true,
+        "reason": "The reason for the viability of this factor"
+      }
+  }
+  ```
+
+
+factor_duplicate_system: |-
+  The user has designed several factors for quant investment. Please help the user deduplicate these factors.
+  These factors are used to build a daily-frequency strategy in the China A-share market.
+
+  The user will provide a pandas-dataframe-like table containing the following information:
+  1. The name of the factor;
+  2. A simple description of the factor;
+  3. The formulation of the factor in LaTeX format;
+  4. The description of the variables and functions in the formulation of the factor.
+
+  The user wants to find whether there are duplicated groups. The factors in a duplicate group should satisfy the following conditions:
+  1. They may differ in name, description, formulation, or in the description of the variables and functions in the formulation, including upper/lower-case differences;
+  2. They must describe exactly the same factor;
+  3. If horizon information such as 1 day, 5 days, or 10 days is provided, the horizon information should be the same.
+
+  To make your response valid, there are some very important constraints to follow:
+  1. Only put factors into a group when you are very confident that they are duplicates;
+  2. A group should contain at least two factors;
+  3. Do not include factors that have no duplicates in your response;
+  4. To avoid merging too many similar factors, do not put more than ten factors into a group!
+  You should always follow the above constraints to make your response valid.
+
+  Your response JSON schema should include:
+  ```json
+  [
+    [
+      "factor name 1",
+      "factor name 2"
+    ],
+    [
+      "factor name 5",
+      "factor name 6"
+    ],
+    [
+      "factor name 7",
+      "factor name 8",
+      "factor name 9"
+    ]
+  ]
+  ```
+  Your response is a list of lists; each inner list represents one duplicate group and contains all the factor names in that group.
+  The factor names in each list should be unique and should match the user's factor names exactly.
+  To avoid reaching the token limit, do not return more than fifty groups in one response. Respond with the output JSON string only, without any interaction or explanation.
\ No newline at end of file diff --git a/rdagent/factor_implementation/evolving/evaluators.py b/rdagent/factor_implementation/evolving/evaluators.py index e7e17bf9..930a9fee 100644 --- a/rdagent/factor_implementation/evolving/evaluators.py +++ b/rdagent/factor_implementation/evolving/evaluators.py @@ -3,20 +3,20 @@ import re from typing import List -from core.evolving_framework import Evaluator as EvolvingEvaluator -from core.evolving_framework import Feedback, QueriedKnowledge -from core.log import FinCoLog -from core.utils import multiprocessing_wrapper -from factor_implementation.evolving.evolvable_subjects import FactorImplementationList -from factor_implementation.share_modules.evaluator import ( +from rdagent.core.evolving_framework import Evaluator as EvolvingEvaluator +from rdagent.core.evolving_framework import Feedback, QueriedKnowledge +from rdagent.core.log import FinCoLog +from rdagent.core.utils import multiprocessing_wrapper +from rdagent.factor_implementation.evolving.evolvable_subjects import FactorImplementationList +from rdagent.factor_implementation.share_modules.evaluator import ( Evaluator as FactorImplementationEvaluator, ) -from factor_implementation.share_modules.evaluator import ( +from rdagent.factor_implementation.share_modules.evaluator import ( FactorImplementationCodeEvaluator, FactorImplementationFinalDecisionEvaluator, FactorImplementationValueEvaluator, ) -from factor_implementation.share_modules.factor import ( +from rdagent.factor_implementation.share_modules.factor import ( FactorImplementation, FactorImplementationTask, ) diff --git a/rdagent/factor_implementation/evolving/evolvable_subjects.py b/rdagent/factor_implementation/evolving/evolvable_subjects.py index 43e91c0e..5c5ab22d 100644 --- a/rdagent/factor_implementation/evolving/evolvable_subjects.py +++ b/rdagent/factor_implementation/evolving/evolvable_subjects.py @@ -1,8 +1,8 @@ from __future__ import annotations -from core.evolving_framework import EvolvableSubjects -from core.log import FinCoLog -from factor_implementation.share_modules.factor import ( +from rdagent.core.evolving_framework import EvolvableSubjects +from rdagent.core.log import FinCoLog +from rdagent.factor_implementation.share_modules.factor import ( FactorImplementation, FactorImplementationTask, ) diff --git a/rdagent/factor_implementation/evolving/evolving_strategy.py b/rdagent/factor_implementation/evolving/evolving_strategy.py index 1f2cf4b2..7a1c19ed 100644 --- a/rdagent/factor_implementation/evolving/evolving_strategy.py +++ b/rdagent/factor_implementation/evolving/evolving_strategy.py @@ -1,21 +1,22 @@ from __future__ import annotations import json +from pathlib import Path import random from abc import abstractmethod from copy import deepcopy from typing import TYPE_CHECKING -from core.evolving_framework import EvolvingStrategy, QueriedKnowledge -from core.utils import multiprocessing_wrapper -from factor_implementation.share_modules.factor import ( +from rdagent.core.evolving_framework import EvolvingStrategy, QueriedKnowledge +from rdagent.core.utils import multiprocessing_wrapper +from rdagent.factor_implementation.share_modules.factor import ( FactorImplementation, FactorImplementationTask, FileBasedFactorImplementation, ) -from factor_implementation.share_modules.prompt import FactorImplementationPrompts +from rdagent.core.prompts import Prompts from jinja2 import Template -from oai.llm_utils import APIBackend +from rdagent.oai.llm_utils import APIBackend from 
rdagent.factor_implementation.share_modules.factor_implementation_config import ( FactorImplementSettings, @@ -117,7 +118,9 @@ def implement_one_factor( queried_former_failed_knowledge_to_render = queried_former_failed_knowledge system_prompt = Template( - FactorImplementationPrompts()["evolving_strategy_factor_implementation_v1_system"], + Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")[ + "evolving_strategy_factor_implementation_v1_system" + ], ).render( data_info=get_data_folder_intro(), queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, @@ -130,7 +133,9 @@ def implement_one_factor( while True: user_prompt = ( Template( - FactorImplementationPrompts()["evolving_strategy_factor_implementation_v1_user"], + Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")[ + "evolving_strategy_factor_implementation_v1_user" + ], ) .render( factor_information_str=factor_information_str, @@ -204,7 +209,9 @@ def implement_one_factor( queried_former_failed_knowledge_to_render = queried_former_failed_knowledge system_prompt = Template( - FactorImplementationPrompts()["evolving_strategy_factor_implementation_v1_system"], + Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")[ + "evolving_strategy_factor_implementation_v1_system" + ], ).render( data_info=get_data_folder_intro(), queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, @@ -224,7 +231,11 @@ def implement_one_factor( and len(queried_former_failed_knowledge_to_render) != 0 ): error_summary_system_prompt = ( - Template(FactorImplementationPrompts()["evolving_strategy_error_summary_v2_system"]) + Template( + Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")[ + "evolving_strategy_error_summary_v2_system" + ] + ) .render( factor_information_str=target_factor_task_information, code_and_feedback=queried_former_failed_knowledge_to_render[ @@ -238,7 +249,11 @@ def implement_one_factor( ) while True: error_summary_user_prompt = ( - Template(FactorImplementationPrompts()["evolving_strategy_error_summary_v2_user"]) + Template( + Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")[ + "evolving_strategy_error_summary_v2_user" + ] + ) .render( queried_similar_component_knowledge=queried_similar_component_knowledge_to_render, ) @@ -258,7 +273,9 @@ def implement_one_factor( user_prompt = ( Template( - FactorImplementationPrompts()["evolving_strategy_factor_implementation_v2_user"], + Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")[ + "evolving_strategy_factor_implementation_v2_user" + ], ) .render( factor_information_str=target_factor_task_information, diff --git a/rdagent/factor_implementation/evolving/factor_implementation_evolving_cli.py b/rdagent/factor_implementation/evolving/factor_implementation_evolving_cli.py index a8946e87..2b410eab 100644 --- a/rdagent/factor_implementation/evolving/factor_implementation_evolving_cli.py +++ b/rdagent/factor_implementation/evolving/factor_implementation_evolving_cli.py @@ -4,24 +4,24 @@ from pathlib import Path import pandas as pd -from core.evolving_framework import EvoAgent, KnowledgeBase -from core.utils import multiprocessing_wrapper -from factor_implementation.evolving.evaluators import ( +from rdagent.core.evolving_framework import EvoAgent, KnowledgeBase +from rdagent.core.utils import multiprocessing_wrapper +from rdagent.factor_implementation.evolving.evaluators import ( FactorImplementationEvaluatorV1, FactorImplementationsMultiEvaluator, ) -from 
factor_implementation.evolving.evolvable_subjects import FactorImplementationList -from factor_implementation.evolving.evolving_strategy import ( +from rdagent.factor_implementation.evolving.evolvable_subjects import FactorImplementationList +from rdagent.factor_implementation.evolving.evolving_strategy import ( FactorEvolvingStrategy, FactorEvolvingStrategyWithGraph, ) -from factor_implementation.evolving.knowledge_management import ( +from rdagent.factor_implementation.evolving.knowledge_management import ( FactorImplementationGraphKnowledgeBase, FactorImplementationGraphRAGStrategy, FactorImplementationKnowledgeBaseV1, FactorImplementationRAGStrategyV1, ) -from factor_implementation.share_modules.factor import ( +from rdagent.factor_implementation.share_modules.factor import ( FactorImplementationTask, FileBasedFactorImplementation, ) diff --git a/rdagent/factor_implementation/evolving/knowledge_management.py b/rdagent/factor_implementation/evolving/knowledge_management.py index b20e7920..d79f5a81 100644 --- a/rdagent/factor_implementation/evolving/knowledge_management.py +++ b/rdagent/factor_implementation/evolving/knowledge_management.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Union -from core.evolving_framework import ( +from rdagent.core.evolving_framework import ( EvolvableSubjects, EvoStep, Knowledge, @@ -16,16 +16,16 @@ QueriedKnowledge, RAGStrategy, ) -from core.log import FinCoLog -from factor_implementation.evolving.evaluators import FactorImplementationSingleFeedback -from factor_implementation.share_modules.factor import ( +from rdagent.core.log import FinCoLog +from rdagent.factor_implementation.evolving.evaluators import FactorImplementationSingleFeedback +from rdagent.factor_implementation.share_modules.factor import ( FactorImplementation, FactorImplementationTask, ) -from factor_implementation.share_modules.prompt import FactorImplementationPrompts -from finco.graph import UndirectedGraph, UndirectedNode +from rdagent.core.prompts import Prompts +from rdagent.knowledge_management.graph import UndirectedGraph, UndirectedNode from jinja2 import Template -from oai.llm_utils import APIBackend, calculate_embedding_distance_between_str_list +from rdagent.oai.llm_utils import APIBackend, calculate_embedding_distance_between_str_list from rdagent.factor_implementation.share_modules.factor_implementation_config import ( FactorImplementSettings, @@ -145,9 +145,9 @@ def query( for target_factor_task in evo.target_factor_tasks: target_factor_task_information = target_factor_task.get_factor_information() if target_factor_task_information in self.knowledgebase.success_task_info_set: - queried_knowledge.success_task_to_knowledge_dict[ - target_factor_task_information - ] = self.knowledgebase.implementation_trace[target_factor_task_information][-1] + queried_knowledge.success_task_to_knowledge_dict[target_factor_task_information] = ( + self.knowledgebase.implementation_trace[target_factor_task_information][-1] + ) elif ( len( self.knowledgebase.implementation_trace.setdefault( @@ -159,14 +159,12 @@ def query( ): queried_knowledge.failed_task_info_set.add(target_factor_task_information) else: - queried_knowledge.working_task_to_former_failed_knowledge_dict[ - target_factor_task_information - ] = self.knowledgebase.implementation_trace.setdefault( - target_factor_task_information, - [], - )[ - -v1_query_former_trace_limit: - ] + queried_knowledge.working_task_to_former_failed_knowledge_dict[target_factor_task_information] = ( + 
self.knowledgebase.implementation_trace.setdefault( + target_factor_task_information, + [], + )[-v1_query_former_trace_limit:] + ) knowledge_base_success_task_list = list( self.knowledgebase.success_task_info_set, @@ -187,9 +185,9 @@ def query( )[-1] for index in similar_indexes ] - queried_knowledge.working_task_to_similar_successful_knowledge_dict[ - target_factor_task_information - ] = similar_successful_knowledge + queried_knowledge.working_task_to_similar_successful_knowledge_dict[target_factor_task_information] = ( + similar_successful_knowledge + ) return queried_knowledge @@ -212,7 +210,7 @@ class FactorImplementationGraphRAGStrategy(RAGStrategy): def __init__(self, knowledgebase: FactorImplementationGraphKnowledgeBase) -> None: super().__init__(knowledgebase) self.current_generated_trace_count = 0 - self.prompt = FactorImplementationPrompts() + self.prompt = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml") def generate_knowledge( self, @@ -417,9 +415,9 @@ def former_trace_query( else: current_index += 1 - factor_implementation_queried_graph_knowledge.former_traces[ - target_factor_task_information - ] = former_trace_knowledge[-v2_query_former_trace_limit:] + factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information] = ( + former_trace_knowledge[-v2_query_former_trace_limit:] + ) else: factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information] = [] diff --git a/rdagent/factor_implementation/share_modules/prompts.yaml b/rdagent/factor_implementation/prompts.yaml similarity index 100% rename from rdagent/factor_implementation/share_modules/prompts.yaml rename to rdagent/factor_implementation/prompts.yaml diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index 8e0c69e9..71d7d859 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -31,6 +31,11 @@ def md5_hash(input_string): return hashed_string +try: + from azure.identity import DefaultAzureCredential, get_bearer_token_provider +except ImportError: + FinCoLog().warning("azure.identity is not installed.") + try: import openai except ImportError: @@ -285,6 +290,7 @@ def __init__( self.encoder = None else: self.use_azure = self.cfg.use_azure + self.use_azure_token_provider = self.cfg.use_azure_token_provider self.chat_api_key = self.cfg.chat_openai_api_key if chat_api_key is None else chat_api_key self.chat_model = self.cfg.chat_model if chat_model is None else chat_model @@ -306,16 +312,32 @@ def __init__( ) if self.use_azure: - self.chat_client = openai.AzureOpenAI( - api_key=self.chat_api_key, - api_version=self.chat_api_version, - azure_endpoint=self.chat_api_base, - ) - self.embedding_client = openai.AzureOpenAI( - api_key=self.embedding_api_key, - api_version=self.embedding_api_version, - azure_endpoint=self.embedding_api_base, - ) + if self.use_azure_token_provider: + credential = DefaultAzureCredential() + token_provider = get_bearer_token_provider( + credential, "https://cognitiveservices.azure.com/.default" + ) + self.chat_client = openai.AzureOpenAI( + azure_ad_token_provider=token_provider, + api_version=self.chat_api_version, + azure_endpoint=self.chat_api_base, + ) + self.embedding_client = openai.AzureOpenAI( + azure_ad_token_provider=token_provider, + api_version=self.embedding_api_version, + azure_endpoint=self.embedding_api_base, + ) + else: + self.chat_client = openai.AzureOpenAI( + api_key=self.chat_api_key, + api_version=self.chat_api_version, + azure_endpoint=self.chat_api_base, + ) + 
self.embedding_client = openai.AzureOpenAI(
+                    api_key=self.embedding_api_key,
+                    api_version=self.embedding_api_version,
+                    azure_endpoint=self.embedding_api_base,
+                )
         else:
             self.chat_client = openai.OpenAI(api_key=self.chat_api_key)
             self.embedding_client = openai.OpenAI(api_key=self.embedding_api_key)
@@ -328,7 +350,7 @@ def __init__(
         self.use_embedding_cache = self.cfg.use_embedding_cache if use_embedding_cache is None else use_embedding_cache
         if self.dump_chat_cache or self.use_chat_cache or self.dump_embedding_cache or self.use_embedding_cache:
             self.cache_file_location = self.cfg.prompt_cache_path
-            self.cache = SQliteLazyCache(self.cache_file_location)
+            self.cache = SQliteLazyCache(cache_location=self.cache_file_location)

         # transfer the config to the class if the config is not supposed to change during the runtime
         self.use_llama2 = self.cfg.use_llama2
diff --git a/requirements.txt b/requirements.txt
index 9b68b3cc..13c937bf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,3 +24,6 @@ azure-ai-formrecognizer

 # factor implementations
 tables
+
+# azure identity related
+azure.identity
\ No newline at end of file
diff --git a/test/oai/test_embedding.py b/test/oai/test_embedding.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/test/oai/test_embedding_and_similarity.py b/test/oai/test_embedding_and_similarity.py
new file mode 100644
index 00000000..8e426d4d
--- /dev/null
+++ b/test/oai/test_embedding_and_similarity.py
@@ -0,0 +1,25 @@
+import pickle
+import unittest
+from pathlib import Path
+import json
+import random
+
+from rdagent.oai.llm_utils import APIBackend, calculate_embedding_distance_between_str_list
+
+
+class TestEmbedding(unittest.TestCase):
+    def test_embedding(self):
+        emb = APIBackend().create_embedding("hello")
+        assert emb is not None
+        assert isinstance(emb, list)
+        assert len(emb) > 0
+
+    def test_embedding_similarity(self):
+        similarity = calculate_embedding_distance_between_str_list(["Hello"], ["Hi"])[0][0]
+        assert similarity is not None
+        assert isinstance(similarity, float)
+        assert similarity >= 0.8
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/oai/test_embedding_similarity.py b/test/oai/test_embedding_similarity.py
deleted file mode 100644
index e69de29b..00000000
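
The reworked `SingletonMeta` in `rdagent/core/utils.py` caches one instance per keyword-argument set (the kwargs are hashed into an instance key), which is why positional arguments raise `RDAgentException` and why the `SQliteLazyCache(cache_location=...)` call site above switched to a keyword argument. A minimal sketch of the resulting contract, assuming the package is installed and run from the repository root; the prompts path is simply the file added in this patch:

```python
from pathlib import Path

from rdagent.core.prompts import Prompts
from rdagent.core.utils import RDAgentException

prompt_path = Path("rdagent/document_process/prompts.yaml")

# Same kwargs -> the same cached instance; a different file_path
# would produce (and cache) a separate instance.
p1 = Prompts(file_path=prompt_path)
p2 = Prompts(file_path=prompt_path)
assert p1 is p2

# Positional arguments are rejected because they cannot be matched
# reliably against the kwargs-based instance key.
try:
    Prompts(prompt_path)
except RDAgentException as exc:
    print(exc)
```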
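Prompt templates are now loaded through the shared `Prompts` class and rendered with Jinja2, as the `evolving_strategy.py` hunks above do. A sketch of the pattern under the same assumptions; the key and render-variable names are the ones used in the diff, while the values passed here are illustrative stand-ins:

```python
from pathlib import Path

from jinja2 import Template

from rdagent.core.prompts import Prompts

# The factor-implementation prompt file renamed in this patch.
prompts = Prompts(file_path=Path("rdagent/factor_implementation/prompts.yaml"))

system_prompt = Template(
    prompts["evolving_strategy_factor_implementation_v1_system"],
).render(
    data_info="(description of the data folder)",  # stand-in value
    queried_former_failed_knowledge=[],            # stand-in value
)
print(system_prompt[:200])
```

Because `Prompts` is a singleton keyed on `file_path`, the repeated inline `Prompts(file_path=...)` constructions in `evolving_strategy.py` all resolve to one cached object, so each YAML file is parsed only once.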
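With `use_azure_token_provider` enabled in `FincoSettings`, `APIBackend` authenticates to Azure OpenAI via Azure AD tokens instead of an API key. The essence of the new construction path, distilled from the `llm_utils.py` hunk; the endpoint and API version below are placeholders to be taken from your own configuration:

```python
import openai
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# DefaultAzureCredential tries environment variables, managed identity,
# the Azure CLI login, and so on, in order.
credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    credential,
    "https://cognitiveservices.azure.com/.default",
)

client = openai.AzureOpenAI(
    azure_ad_token_provider=token_provider,  # no api_key needed
    api_version="2024-02-01",  # placeholder; use your configured version
    azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
)
```

The patch guards the `azure.identity` import with `try`/`except ImportError`, so key-based authentication keeps working when the optional dependency is absent.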
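Two `document_analysis.py` entry points changed shape: `check_factor_dict_viability` became `filter_factor_by_viability`, returning the filtered factor dict together with the viability map, and `deduplicate_factors_several_times` now returns the deduplicated dict plus the duplicate-name groups. A sketch of how callers unpack them, mirroring the rewritten `factor_extract_and_implement.py`; the single factor entry is illustrative only, and both calls go through the configured LLM backend:

```python
from rdagent.document_process.document_analysis import (
    deduplicate_factors_several_times,
    filter_factor_by_viability,
)

# Illustrative input in the shape produced by
# merge_file_to_factor_dict_to_factor_dict(...).
factor_dict = {
    "momentum_20d": {
        "description": "20-day price momentum",
        "formulation": r"\frac{close_t}{close_{t-20}} - 1",
        "variables": {"close_t": "daily close price on day t"},
    },
}

factor_dict_viable, factor_viability = filter_factor_by_viability(factor_dict)
factor_dict_deduped, duplication_names_list = deduplicate_factors_several_times(
    factor_dict,
    factor_viability,
)
```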