From f4cace620f80e67684b54f5f49ec7644ecf7ada7 Mon Sep 17 00:00:00 2001 From: xuyang1 Date: Tue, 21 May 2024 14:47:11 +0000 Subject: [PATCH] update code --- .env.example | 27 + rdagent/app/CI/ci.ipynb | 24 + .../factor_extract_and_implement.ipynb | 0 .../factor_extract_and_implement.py | 139 +++ rdagent/core/conf.py | 47 + rdagent/core/evolving_framework.py | 183 ++++ rdagent/core/log.py | 155 +++ rdagent/core/utils.py | 165 ++++ .../__init__.py | 0 rdagent/document_process/document_analysis.py | 615 ++++++++++++ rdagent/document_process/document_reader.py | 109 +++ .../evolving}/__init__.py | 0 .../evolving/evaluators.py | 231 +++++ .../evolving/evolvable_subjects.py | 34 + .../evolving/evolving_strategy.py | 298 ++++++ .../factor_implementation_evolving_cli.py | 311 ++++++ .../evolving/knowledge_management.py | 905 ++++++++++++++++++ .../share_modules/conf.py | 42 + .../share_modules/evaluator.py | 535 +++++++++++ .../share_modules/exception.py | 26 + .../share_modules/factor.py | 221 +++++ .../share_modules/factor_gen.py | 31 + .../share_modules/prompt.py | 23 + .../share_modules/prompts.yaml | 196 ++++ .../share_modules/utils.py | 51 + rdagent/knowledge_management/graph.py | 490 ++++++++++ rdagent/oai/llm_utils.py | 706 ++++++++++++++ 27 files changed, 5564 insertions(+) create mode 100644 .env.example delete mode 100644 rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.ipynb create mode 100644 rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py create mode 100644 rdagent/core/log.py create mode 100644 rdagent/core/utils.py rename rdagent/{data_driven_code_generation => document_process}/__init__.py (100%) create mode 100644 rdagent/document_process/document_analysis.py create mode 100644 rdagent/document_process/document_reader.py rename rdagent/{document_analysis => factor_implementation/evolving}/__init__.py (100%) create mode 100644 rdagent/factor_implementation/evolving/evaluators.py create mode 100644 rdagent/factor_implementation/evolving/evolvable_subjects.py create mode 100644 rdagent/factor_implementation/evolving/evolving_strategy.py create mode 100644 rdagent/factor_implementation/evolving/factor_implementation_evolving_cli.py create mode 100644 rdagent/factor_implementation/evolving/knowledge_management.py create mode 100644 rdagent/factor_implementation/share_modules/conf.py create mode 100644 rdagent/factor_implementation/share_modules/evaluator.py create mode 100644 rdagent/factor_implementation/share_modules/exception.py create mode 100644 rdagent/factor_implementation/share_modules/factor.py create mode 100644 rdagent/factor_implementation/share_modules/factor_gen.py create mode 100644 rdagent/factor_implementation/share_modules/prompt.py create mode 100644 rdagent/factor_implementation/share_modules/prompts.yaml create mode 100644 rdagent/factor_implementation/share_modules/utils.py create mode 100644 rdagent/knowledge_management/graph.py diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..b7244697 --- /dev/null +++ b/.env.example @@ -0,0 +1,27 @@ +# Global configs: +USE_AZURE=True +MAX_RETRY=10 +RETRY_WAIT_SECONDS=20 +DUMP_CHAT_CACHE=True +USE_CHAT_CACHE=True +DUMP_EMBEDDING_CACHE=True +USE_EMBEDDING_CACHE=True +LOG_LLM_CHAT_CONTENT=False +CHAT_FREQUENCY_PENALTY=0.0 +CHAT_PRESENCE_PENALTY=0.0 + +# embedding model configs: +EMBEDDING_OPENAI_API_KEY=your_api_key +EMBEDDING_AZURE_API_BASE=your_api_base +EMBEDDING_AZURE_API_VERSION=your_api_version +EMBEDDING_MODEL=text-embedding-3-small + 
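+# NOTE: with USE_AZURE=True, the *_AZURE_API_BASE / *_AZURE_API_VERSION
+# entries above and below are expected to be used for the Azure OpenAI endpoints.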
+
+# chat model configs:
+CHAT_OPENAI_API_KEY=your_api_key
+CHAT_AZURE_API_BASE=your_api_base
+CHAT_AZURE_API_VERSION=your_api_version
+CHAT_MODEL=your_model_version
+CHAT_MAX_TOKENS=3000
+CHAT_TEMPERATURE=0.7
+CHAT_STREAM=True
diff --git a/rdagent/app/CI/ci.ipynb b/rdagent/app/CI/ci.ipynb
index e69de29b..00d4b329 100644
--- a/rdagent/app/CI/ci.ipynb
+++ b/rdagent/app/CI/ci.ipynb
@@ -0,0 +1,24 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "rdagent",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.ipynb b/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.ipynb
deleted file mode 100644
index e69de29b..00000000
diff --git a/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py b/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py
new file mode 100644
index 00000000..aa99ca46
--- /dev/null
+++ b/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py
@@ -0,0 +1,139 @@
+# %%
+from document_process.document_reader import load_and_process_pdfs_by_langchain
+from dotenv import load_dotenv
+from oai.llm_utils import APIBackend
+from pathlib import Path
+import json
+
+
+from document_process.document_analysis import classify_report_from_dict
+from document_process.document_analysis import extract_factors_from_report_dict_and_classify_result
+from document_process.document_analysis import check_factor_dict_viability
+from document_process.document_analysis import deduplicate_factors_several_times
+
+
+def extract_factors_and_implement(report_file_path: str):
+    assert load_dotenv()
+    api = APIBackend()
+    docs_dict_select = load_and_process_pdfs_by_langchain(Path(report_file_path))
+
+    selected_report_dict = classify_report_from_dict(report_dict=docs_dict_select, api=api, vote_time=1)
+    file_to_factor_result = extract_factors_from_report_dict_and_classify_result(docs_dict_select, selected_report_dict)
+
+    factor_dict = {}
+    for file_name in file_to_factor_result:
+        for factor_name in file_to_factor_result[file_name]:
+            factor_dict.setdefault(factor_name, [])
+            factor_dict[factor_name].append(file_to_factor_result[file_name][factor_name])
+
+    factor_dict_simple_deduplication = {}
+    for factor_name in factor_dict:
+        if len(factor_dict[factor_name]) > 1:
+            factor_dict_simple_deduplication[factor_name] = max(
+                factor_dict[factor_name], key=lambda x: len(x["formulation"])
+            )
+        else:
+            factor_dict_simple_deduplication[factor_name] = factor_dict[factor_name][0]
+    # %%
+
+    factor_viability = check_factor_dict_viability(factor_dict_simple_deduplication)
+    # json.dump(
+    #     factor_viability,
+    #     open(
+    #         "factor_viability_all_reports.json",
+    #         "w",
+    #     ),
+    #     indent=4,
+    # )
+
+    # factor_viability = json.load(
+    #     open(
+    #         "factor_viability_all_reports.json"
+    #     )
+    # )
+
+    # %%
+
+    duplication_names_list = deduplicate_factors_several_times(factor_dict_simple_deduplication)
+    duplication_names_list = sorted(duplication_names_list, key=lambda x: len(x), reverse=True)
+    json.dump(duplication_names_list, open("duplication_names_list.json", "w"), indent=4)
+
+    # %%
+    factor_dict_viable = {
+        factor_name: factor_dict_simple_deduplication[factor_name]
+        for factor_name in factor_dict_simple_deduplication
+        if factor_viability[factor_name]["viability"]
+    }
+
+    to_replace_dict = {}
+    for duplication_names in duplication_names_list:
+        for duplication_factor_name in duplication_names[1:]:
+            to_replace_dict[duplication_factor_name] = duplication_names[0]
+
+    added_lower_name_set = set()
+    factor_dict_deduplication_with_llm = dict()
+    for factor_name in factor_dict_simple_deduplication:
+        if factor_name not in to_replace_dict and factor_name.lower() not in added_lower_name_set:
+            added_lower_name_set.add(factor_name.lower())
+            factor_dict_deduplication_with_llm[factor_name] = factor_dict_simple_deduplication[factor_name]
+
+    to_replace_viable_dict = {}
+    for duplication_names in duplication_names_list:
+        viability_list = [factor_viability[name]["viability"] for name in duplication_names]
+        if True not in viability_list:
+            continue
+        target_factor_name = duplication_names[viability_list.index(True)]
+        for duplication_factor_name in duplication_names:
+            if duplication_factor_name == target_factor_name:
+                continue
+            to_replace_viable_dict[duplication_factor_name] = target_factor_name
+
+    added_lower_name_set = set()
+    factor_dict_deduplication_with_llm_and_viable = dict()
+    for factor_name in factor_dict_viable:
+        if factor_name not in to_replace_viable_dict and factor_name.lower() not in added_lower_name_set:
+            added_lower_name_set.add(factor_name.lower())
+            factor_dict_deduplication_with_llm_and_viable[factor_name] = factor_dict_simple_deduplication[factor_name]
+
+    # %%
+
+    dump_md_list = [
+        [factor_dict_simple_deduplication, "final_factor_book"],
+        [factor_dict_viable, "final_viable_factor_book"],
+        [factor_dict_deduplication_with_llm, "final_deduplicated_factor_book"],
+        [factor_dict_deduplication_with_llm_and_viable, "final_deduplicated_viable_factor_book"],
+    ]
+
+    for dump_md in dump_md_list:
+        factor_name_set = set()
+        current_index = 1
+        target_dict = dump_md[0]
+        json.dump(target_dict, open(f"{dump_md[1]}.json", "w"), indent=4)
+        with open(
+            rf"{dump_md[1]}.md",
+            "w",
+        ) as fw:
+            for factor_name in target_dict:
+                formulation = target_dict[factor_name]["formulation"]
+                if factor_name in formulation:
+                    target_factor_name = factor_name.replace("_", r"\_")
+                    formulation = formulation.replace(factor_name, target_factor_name)
+                for variable in target_dict[factor_name]["variables"]:
+                    if variable in formulation:
+                        target_variable = variable.replace("_", r"\_")
+                        formulation = formulation.replace(variable, target_variable)
+
+                fw.write(f"## {current_index}. Factor Name: {factor_name}\n")
+                fw.write(f"### Viability: {target_dict[factor_name]['viability']}\n")
+                fw.write(f"### Viability Reason: {target_dict[factor_name]['viability_reason']}\n")
+                fw.write(f"### description: {target_dict[factor_name]['description']}\n")
+                fw.write(f"### formulation: $$ {formulation} $$\n")
+                fw.write(f"### formulation string: {formulation}\n")
+                # write a table of variables and their descriptions
+
+                fw.write(f"### variable tables: \n")
+                fw.write(f"| variable | description |\n")
+                fw.write(f"| -------- | ----------- |\n")
+                for variable in target_dict[factor_name]["variables"]:
+                    fw.write(f"| {variable} | {target_dict[factor_name]['variables'][variable]} |\n")
+
+                current_index += 1
diff --git a/rdagent/core/conf.py b/rdagent/core/conf.py
index e69de29b..577dd325 100644
--- a/rdagent/core/conf.py
+++ b/rdagent/core/conf.py
@@ -0,0 +1,47 @@
+# TODO: use pydantic for other modules in Qlib
+# from pydantic_settings import BaseSettings
+import os
+from typing import Union
+
+from dotenv import load_dotenv
+
+# make sure that env variables are loaded while calling Config()
+load_dotenv(verbose=True, override=True)
+
+from pydantic_settings import BaseSettings
+
+
+class FincoSettings(BaseSettings):
+    use_azure: bool = True
+    max_retry: int = 10
+    retry_wait_seconds: int = 1
+    continuous_mode: bool = False
+    dump_chat_cache: bool = False
+    use_chat_cache: bool = False
+    dump_embedding_cache: bool = False
+    use_embedding_cache: bool = False
+    prompt_cache_path: str = os.getcwd() + "/prompt_cache.db"
+    session_cache_folder_location: str = os.getcwd() + "/session_cache_folder/"
+    max_past_message_include: int = 10
+
+    log_llm_chat_content: bool = True
+
+    # Chat configs
+    chat_openai_api_key: str = ""
+    chat_azure_api_base: str = ""
+    chat_azure_api_version: str = ""
+    chat_model: str = ""
+    chat_max_tokens: int = 3000
+    chat_temperature: float = 0.5
+    chat_stream: bool = True
+    chat_seed: Union[int, None] = None
+    chat_frequency_penalty: float = 0.0
+    chat_presence_penalty: float = 0.0
+
+    default_system_prompt: str = "You are an AI assistant who helps to answer user's questions about finance."
+
+    # Embedding configs
+    embedding_openai_api_key: str = ""
+    embedding_azure_api_base: str = ""
+    embedding_azure_api_version: str = ""
+    embedding_model: str = ""
diff --git a/rdagent/core/evolving_framework.py b/rdagent/core/evolving_framework.py
index e69de29b..1bc90044 100644
--- a/rdagent/core/evolving_framework.py
+++ b/rdagent/core/evolving_framework.py
@@ -0,0 +1,183 @@
+from __future__ import annotations
+
+import copy
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any
+
+
+class Feedback:
+    pass
+
+
+class Knowledge:
+    pass
+
+
+class QueriedKnowledge:
+    pass
+
+
+class KnowledgeBase(ABC):
+    @abstractmethod
+    def query(
+        self,
+    ) -> QueriedKnowledge | None:
+        raise NotImplementedError
+
+
+class EvolvableSubjects:
+    """The target object to be evolved"""
+
+    def clone(self) -> EvolvableSubjects:
+        return copy.deepcopy(self)
+
+
+class QlibEvolvableSubjects(EvolvableSubjects): ...
+
+
+class Evaluator(ABC):
+    """Evaluates an EvolvableSubjects; the evaluator may be external
+    (e.g., a whitebox evaluator) or internal (e.g., LLM-based).
+
+    FAQ:
+    - Q: If we have an external whitebox evaluator, do we need an
+      internal evaluator?
+      A: When the external evaluator is very complex, an internal LLM-based evaluator
+      may provide more understandable feedback.
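+
+    A minimal subclass sketch (illustrative only; `DummyEvaluator` is not part
+    of this patch):
+
+        class DummyEvaluator(Evaluator):
+            def evaluate(self, evo: EvolvableSubjects, **kwargs: Any) -> Feedback:
+                return Feedback()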
+    """
+
+    @abstractmethod
+    def evaluate(self, evo: EvolvableSubjects, **kwargs: Any) -> Feedback:
+        raise NotImplementedError
+
+
+class SelfEvaluator(Evaluator):
+    pass
+
+
+@dataclass
+class EvoStep:
+    """At a specific step,
+    based on
+    - previous trace
+    - newly queried RAG knowledge `QueriedKnowledge`
+
+    the EvolvableSubjects is evolved into a new `EvolvableSubjects`.
+
+    (optional) After evaluation, we get feedback `feedback`.
+    """
+
+    evolvable_subjects: EvolvableSubjects
+    queried_knowledge: QueriedKnowledge | None = None
+    feedback: Feedback | None = None
+
+
+class EvolvingStrategy(ABC):
+    @abstractmethod
+    def evolve(
+        self,
+        *evo: EvolvableSubjects,
+        evolving_trace: list[EvoStep] | None = None,
+        queried_knowledge: QueriedKnowledge | None = None,
+        **kwargs: Any,
+    ) -> EvolvableSubjects:
+        """The evolving trace is a list of (evolvable_subjects, feedback) ordered
+        by time.
+
+        Why these parameters matter for evolving:
+        - evolving_trace: the historical feedback is important.
+        - queried_knowledge: the knowledge queried for the current step.
+        """
+
+
+class RAGStrategy(ABC):
+    """Retrieval-Augmented Generation strategy"""
+
+    def __init__(self, knowledgebase: KnowledgeBase) -> None:
+        self.knowledgebase = knowledgebase
+
+    @abstractmethod
+    def query(
+        self,
+        evo: EvolvableSubjects,
+        evolving_trace: list[EvoStep],
+        **kwargs: Any,
+    ) -> QueriedKnowledge | None:
+        pass
+
+    @abstractmethod
+    def generate_knowledge(
+        self,
+        evolving_trace: list[EvoStep],
+        *,
+        return_knowledge: bool = False,
+        **kwargs: Any,
+    ) -> Knowledge | None:
+        """Generate new knowledge based on the evolving trace.
+        - It is encouraged to query related knowledge before generating new knowledge.
+
+        RAGStrategy should maintain the new knowledge all by itself.
+        """
+
+
+class EvoAgent:
+    """It is responsible for driving the workflow."""
+
+    evolving_trace: list[EvoStep]
+
+    def __init__(
+        self,
+        evolving_strategy: EvolvingStrategy,
+        rag: RAGStrategy | None = None,
+    ) -> None:
+        self.evolving_trace = []
+        self.evolving_strategy = evolving_strategy
+        self.rag = rag
+
+    def step_evolving(
+        self,
+        evo: EvolvableSubjects,
+        eva: Evaluator | Feedback,
+        *,
+        with_knowledge: bool = False,
+        with_feedback: bool = True,
+        knowledge_self_gen: bool = False,
+    ) -> EvolvableSubjects:
+        """Common evolving modes are supported by this API.
+        - Interactive evolving:
+          - `with_feedback=True` and `eva` is an external Evaluator.
+
+        - Knowledge-driven evolving:
+          - `with_knowledge=True` and related knowledge is
+            queried based on `self.rag`
+
+        - Self-evolving: we have two ways to self-evolve.
+          - 1) self-generate knowledge and then evolve
+            - `knowledge_self_gen=True` and `with_knowledge=True`
+          - 2) self-evaluate to generate feedback and then evolve
+            - `with_feedback=True` and `eva` is an internal Evaluator.
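+
+        A minimal usage sketch (hypothetical `my_strategy`, `my_evaluator`,
+        and `my_subjects`; not objects defined in this patch):
+
+            agent = EvoAgent(evolving_strategy=my_strategy)
+            evo = my_subjects
+            for _ in range(5):
+                evo = agent.step_evolving(evo, my_evaluator)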
+        """
+        # knowledge self-evolving
+        if knowledge_self_gen and self.rag is not None:
+            self.rag.generate_knowledge(self.evolving_trace)
+
+        # RAG
+        queried_knowledge = None
+        if with_knowledge and self.rag is not None:
+            queried_knowledge = self.rag.query(evo, self.evolving_trace)
+
+        # Evolve
+        evo = self.evolving_strategy.evolve(
+            evo=evo,
+            evolving_trace=self.evolving_trace,
+            queried_knowledge=queried_knowledge,
+        )
+        es = EvoStep(evo, queried_knowledge)
+
+        # Evaluate
+        if with_feedback:
+            es.feedback = eva if isinstance(eva, Feedback) else eva.evaluate(evo, queried_knowledge=queried_knowledge)
+
+        # Update trace
+        self.evolving_trace.append(es)
+        return evo
diff --git a/rdagent/core/log.py b/rdagent/core/log.py
new file mode 100644
index 00000000..535edd07
--- /dev/null
+++ b/rdagent/core/log.py
@@ -0,0 +1,155 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Generator, Sequence
+
+from loguru import logger
+
+
+if TYPE_CHECKING:
+    from loguru import Logger
+
+
+class LogColors:
+    """
+    ANSI color codes for use in console output.
+    """
+
+    RED = "\033[91m"
+    GREEN = "\033[92m"
+    YELLOW = "\033[93m"
+    BLUE = "\033[94m"
+    MAGENTA = "\033[95m"
+    CYAN = "\033[96m"
+    WHITE = "\033[97m"
+    GRAY = "\033[90m"
+    BLACK = "\033[30m"
+
+    BOLD = "\033[1m"
+    ITALIC = "\033[3m"
+
+    END = "\033[0m"
+
+    @classmethod
+    def get_all_colors(cls: type[LogColors]) -> list:
+        names = dir(cls)
+        names = [name for name in names if not name.startswith("__") and not callable(getattr(cls, name))]
+        return [getattr(cls, name) for name in names]
+
+    def render(self, text: str, color: str = "", style: str = "") -> str:
+        """
+        Render text with the given color and style.
+        It is not recommended to pass in text that has already been rendered.
+        """
+        # This method is called too frequently, which is not good.
+        colors = self.get_all_colors()
+        # Perhaps color and font should be distinguished here.
+        if color and color not in colors:
+            # Changes to accommodate ruff checks.
+            # Original code:
+            # raise ValueError(f"color should be in: {colors} but now is: {color}")
+            # Description of the problem:
+            # TRY003 Avoid specifying long messages outside the exception class
+            # EM102 Exception must not use an f-string literal, assign to variable first
+            # References:
+            # https://docs.astral.sh/ruff/rules/raise-vanilla-args/
+            # https://docs.astral.sh/ruff/rules/f-string-in-exception/
+            error_message = f"color should be in: {colors} but now is: {color}"
+            raise ValueError(error_message)
+        if style and style not in colors:
+            # Changes to accommodate ruff checks.
+ # Original code: + # raise ValueError(f"style should be in: {colors} but now is: {style}") + # Description of the problem: + # TRY003 Avoid specifying long messages outside the exception class + # EM102 Exception must not use an f-string literal, assign to variable first + # References: + # https://docs.astral.sh/ruff/rules/raise-vanilla-args/ + # https://docs.astral.sh/ruff/rules/f-string-in-exception/ + error_message = f"style should be in: {colors} but now is: {style}" + raise ValueError(error_message) + + text = f"{color}{text}{self.END}" + + return f"{style}{text}{self.END}" + + +class FinCoLog: + # logger.add(loguru_handler, level="INFO") # you can add use storage as a loguru handler + + def __init__(self) -> None: + self.logger: Logger = logger + + def info(self, *args: Sequence, plain: bool = False, title: str = "Info") -> None: + if plain: + return self.plain_info(*args) + for arg in args: + # Changes to accommodate ruff checks. + # Original code: + # self.logger.info(f"{LogColors.WHITE}{arg}{LogColors.END}") + # Description of the problem: + # G004 Logging statement uses f-string + # References: + # https://docs.astral.sh/ruff/rules/logging-f-string/ + info = f"{LogColors.WHITE}{arg}{LogColors.END}" + self.logger.info(info) + return None + + def __getstate__(self) -> dict: + return {} + + # Changes to accommodate ruff checks. + # Original code: def __setstate__(self, _: str) -> None: + # Description of the problem: + # PLE0302 The special method `__setstate__` expects 2 parameters, 1 was given + # References: https://docs.astral.sh/ruff/rules/unexpected-special-method-signature/ + def __setstate__(self, _: str) -> None: + self.logger = logger + + def plain_info(self, *args: Sequence) -> None: + for arg in args: + # Changes to accommodate ruff checks. + # Original code: + # self.logger.info( + # f""" + # {LogColors.YELLOW}{LogColors.BOLD} + # Info:{LogColors.END}{LogColors.WHITE}{arg}{LogColors.END} + # """, + # ) + # Description of the problem: + # G004 Logging statement uses f-string + # References: + # https://docs.astral.sh/ruff/rules/logging-f-string/ + info = f""" + {LogColors.YELLOW}{LogColors.BOLD} + Info:{LogColors.END}{LogColors.WHITE}{arg}{LogColors.END} + """ + self.logger.info(info) + + def warning(self, *args: Sequence) -> None: + for arg in args: + # Changes to accommodate ruff checks. + # Original code: + # self.logger.warning( + # f"{LogColors.BLUE}{LogColors.BOLD}Warning:{LogColors.END}{arg}", + # ) + # Description of the problem: + # G004 Logging statement uses f-string + # References: + # https://docs.astral.sh/ruff/rules/logging-f-string/ + info = f"{LogColors.BLUE}{LogColors.BOLD}Warning:{LogColors.END}{arg}" + self.logger.warning(info) + + def error(self, *args: Sequence) -> None: + for arg in args: + # Changes to accommodate ruff checks. 
+ # Original code: + # self.logger.error( + # f"{LogColors.RED}{LogColors.BOLD}Error:{LogColors.END}{arg}", + # ) + # Description of the problem: + # G004 Logging statement uses f-string + # References: + # https://docs.astral.sh/ruff/rules/logging-f-string/ + info = f"{LogColors.RED}{LogColors.BOLD}Error:{LogColors.END}{arg}" + self.logger.error(info) diff --git a/rdagent/core/utils.py b/rdagent/core/utils.py new file mode 100644 index 00000000..ace5d83f --- /dev/null +++ b/rdagent/core/utils.py @@ -0,0 +1,165 @@ +from __future__ import annotations + +import json +import os +import random +import string +import uuid +from pathlib import Path +import importlib +from typing import Any + +import yaml +from fuzzywuzzy import fuzz + +import multiprocessing as mp +from collections.abc import Callable + + +class FincoException(Exception): + pass + + +class SingletonMeta(type): + _instance = None + + def __call__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super(SingletonMeta, cls).__call__(*args, **kwargs) + return cls._instance + + +class SingletonBaseClass(metaclass=SingletonMeta): + """ + Because we try to support defining Singleton with `class A(SingletonBaseClass)` instead of `A(metaclass=SingletonMeta)` + This class becomes necessary + + """ + + # TODO: Add move this class to Qlib's general utils. + + +def parse_json(response): + try: + return json.loads(response) + except json.decoder.JSONDecodeError: + pass + + raise Exception(f"Failed to parse response: {response}, please report it or help us to fix it.") + + +def similarity(text1, text2): + text1 = text1 if isinstance(text1, str) else "" + text2 = text2 if isinstance(text2, str) else "" + + # Maybe we can use other similarity algorithm such as tfidf + return fuzz.ratio(text1, text2) + + +def random_string(length=10): + letters = string.ascii_letters + string.digits + return "".join(random.choice(letters) for i in range(length)) + + +def remove_uncommon_keys(new_dict, org_dict): + keys_to_remove = [] + + for key in new_dict: + if key not in org_dict: + keys_to_remove.append(key) + elif isinstance(new_dict[key], dict) and isinstance(org_dict[key], dict): + remove_uncommon_keys(new_dict[key], org_dict[key]) + elif isinstance(new_dict[key], dict) and isinstance(org_dict[key], str): + new_dict[key] = org_dict[key] + + for key in keys_to_remove: + del new_dict[key] + + +def crawl_the_folder(folder_path: Path): + yaml_files = [] + for root, _, files in os.walk(folder_path.as_posix()): + for file in files: + if file.endswith(".yaml") or file.endswith(".yml"): + yaml_file_path = Path(os.path.join(root, file)).relative_to(folder_path) + yaml_files.append(yaml_file_path.as_posix()) + return sorted(yaml_files) + + +def compare_yaml(file1, file2): + with open(file1, "r") as stream: + data1 = yaml.safe_load(stream) + with open(file2, "r") as stream: + data2 = yaml.safe_load(stream) + return data1 == data2 + + +def remove_keys(valid_keys, ori_dict): + for key in list(ori_dict.keys()): + if key not in valid_keys: + ori_dict.pop(key) + return ori_dict + + +class YamlConfigCache(SingletonBaseClass): + def __init__(self) -> None: + super().__init__() + self.path_to_config = dict() + + def load(self, path): + with open(path, "r") as stream: + data = yaml.safe_load(stream) + self.path_to_config[path] = data + + def __getitem__(self, path): + if path not in self.path_to_config: + self.load(path) + return self.path_to_config[path] + + +def import_class(class_path: str) -> Any: + """ + Parameters + ---------- + class_path : str + 
class path like"scripts.factor_implementation.baselines.naive.one_shot.OneshotFactorGen" + + Returns + ------- + class of `class_path` + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def multiprocessing_wrapper(func_calls: list[tuple[Callable, tuple]], n: int) -> list: + """It will use multiprocessing to call the functions in func_calls with the given parameters. + The results equals to `return [f(*args) for f, args in func_calls]` + It will not call multiprocessing if `n=1` + + Parameters + ---------- + func_calls : List[Tuple[Callable, Tuple]] + the list of functions and their parameters + n : int + the number of subprocesses + + Returns + ------- + list + + """ + if n == 1: + return [f(*args) for f, args in func_calls] + with mp.Pool(processes=n) as pool: + results = [pool.apply_async(f, args) for f, args in func_calls] + return [result.get() for result in results] + + +# You can test the above function +# def f(x): +# return x**2 +# +# if __name__ == "__main__": +# print(multiprocessing_wrapper([(f, (i,)) for i in range(10)], 4)) diff --git a/rdagent/data_driven_code_generation/__init__.py b/rdagent/document_process/__init__.py similarity index 100% rename from rdagent/data_driven_code_generation/__init__.py rename to rdagent/document_process/__init__.py diff --git a/rdagent/document_process/document_analysis.py b/rdagent/document_process/document_analysis.py new file mode 100644 index 00000000..20d92dc4 --- /dev/null +++ b/rdagent/document_process/document_analysis.py @@ -0,0 +1,615 @@ +from __future__ import annotations + +import json +import multiprocessing as mp +import re +from pathlib import Path +from typing import TYPE_CHECKING, Mapping + +import numpy as np +import pandas as pd +import tiktoken +import yaml +from azure.ai.formrecognizer import DocumentAnalysisClient +from azure.core.credentials import AzureKeyCredential +from jinja2 import Template +from sklearn.cluster import KMeans +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.preprocessing import normalize + +from core.conf import FincoSettings as Config +from oai.llm_utils import APIBackend, create_embedding_with_multiprocessing +from core.log import FinCoLog + +if TYPE_CHECKING: + from langchain_core.documents import Document + +from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader + +with (Path(__file__).parent / "util_prompt.yaml").open(encoding="utf8") as f: + UTIL_PROMPT = yaml.safe_load( + f, + ) + + +def load_documents_by_langchain(path: Path) -> list: + """Load documents from the specified path. + + Args: + path (str): The path to the directory or file containing the documents. + + Returns: + list: A list of loaded documents. + """ + loader = PyPDFDirectoryLoader(str(path), silent_errors=True) if path.is_dir() else PyPDFLoader(str(path)) + return loader.load() + + +def process_documents_by_langchain(docs: list[Document]) -> dict[str, str]: + """Process a list of documents and group them by document name. + + Args: + docs (list): A list of documents. + + Returns: + dict: A dictionary where the keys are document names and the values are + the concatenated content of the documents. 
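+
+    Example return value (illustrative; path and page text abridged):
+        {"/abs/path/report.pdf": "page 1 text...page 2 text..."}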
+ """ + content_dict = {} + + for doc in docs: + doc_name = str(Path(doc.metadata["source"]).resolve()) + doc_content = doc.page_content + + if doc_name not in content_dict: + content_dict[str(doc_name)] = doc_content + else: + content_dict[str(doc_name)] += doc_content + + return content_dict + + +def load_and_process_pdfs_by_langchain(path: Path) -> dict[str, str]: + return process_documents_by_langchain(load_documents_by_langchain(path)) + + +def load_and_process_one_pdf_by_azure_document_intelligence( + path: Path, + key: str, + endpoint: str, +) -> str: + pages = len(PyPDFLoader(str(path)).load()) + document_analysis_client = DocumentAnalysisClient( + endpoint=endpoint, + credential=AzureKeyCredential(key), + ) + + with path.open("rb") as file: + result = document_analysis_client.begin_analyze_document( + "prebuilt-document", + file, + pages=f"1-{pages}", + ).result() + return result.content + + +def load_and_process_pdfs_by_azure_document_intelligence(path: Path) -> dict[str, str]: + config = Config() + + assert config.azure_document_intelligence_key is not None + assert config.azure_document_intelligence_endpoint is not None + + content_dict = {} + ab_path = path.resolve() + if ab_path.is_file(): + assert ".pdf" in ab_path.suffixes, "The file must be a PDF file." + proc = load_and_process_one_pdf_by_azure_document_intelligence + content_dict[str(ab_path)] = proc( + ab_path, + config.azure_document_intelligence_key, + config.azure_document_intelligence_endpoint, + ) + else: + for file_path in ab_path.rglob("*"): + if file_path.is_file() and ".pdf" in file_path.suffixes: + content_dict[str(file_path)] = load_and_process_one_pdf_by_azure_document_intelligence( + file_path, + config.azure_document_intelligence_key, + config.azure_document_intelligence_endpoint, + ) + return content_dict + + +def classify_report_from_dict( + report_dict: Mapping[str, str], + api: APIBackend, + input_max_token: int = 128000, + vote_time: int = 1, + substrings: tuple[str] = (), +) -> dict[str, dict[str, str]]: + """ + Parameters: + - report_dict (Dict[str, str]): + A dictionary where the key is the path of the report (ending with .pdf), + and the value is either the report content as a string. + - api (APIBackend): An instance of the APIBackend class. + - input_max_token (int): Specifying the maximum number of input tokens. + - vote_time (int): An integer specifying how many times to vote. + - substrings (list(str)): List of hardcode substrings. + + Returns: + - Dict[str, Dict[str, str]]: A dictionary where each key is the path of the report, + with a single key 'class' and its value being the classification result (0 or 1). 
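+
+    Example return value (illustrative):
+        {"/abs/path/report.pdf": {"class": 1}}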
+
+    """
+    if len(substrings) == 0:
+        # The default keywords are Chinese quant-finance terms; they are matched
+        # against the (Chinese) report text, so they are intentionally not translated.
+        substrings = (
+            "FinCo",
+            "金融工程",
+            "金工",
+            "回测",
+            "因子",
+            "机器学习",
+            "深度学习",
+            "量化",
+        )
+
+    res_dict = {}
+    classify_prompt = UTIL_PROMPT["classify_system"]
+    enc = tiktoken.encoding_for_model("gpt-4-turbo")
+
+    for key, value in report_dict.items():
+        if not key.endswith(".pdf"):
+            continue
+        file_name = key
+
+        if isinstance(value, str):
+            content = value
+        else:
+            FinCoLog().warning(f"Unexpected input format: {file_name}")
+            res_dict[file_name] = {"class": 0}
+            continue
+
+        if not any(substring in content for substring in substrings):
+            res_dict[file_name] = {"class": 0}
+        else:
+            gpt_4_max_token = 128000
+            if input_max_token < gpt_4_max_token:
+                content = enc.encode(content)
+                max_token_1 = max(0, min(len(content), input_max_token) - 1)
+                content = enc.decode(content[:max_token_1])
+
+            vote_list = []
+            for _ in range(vote_time):
+                user_prompt = content
+                system_prompt = classify_prompt
+                res = api.build_messages_and_create_chat_completion(
+                    user_prompt=user_prompt,
+                    system_prompt=system_prompt,
+                    json_mode=True,
+                )
+                try:
+                    res = json.loads(res)
+                    vote_list.append(int(res["class"]))
+                except json.JSONDecodeError:
+                    FinCoLog().warning(f"Failed to parse response for: {file_name}")
+                    res_dict[file_name] = {"class": 0}
+                count_0 = vote_list.count(0)
+                count_1 = vote_list.count(1)
+                if max(count_0, count_1) > int(vote_time / 2):
+                    break
+
+            result = 1 if count_1 > count_0 else 0
+            res_dict[file_name] = {"class": result}
+
+    return res_dict
+
+
+def __extract_factors_name_and_desc_from_content(
+    content: str,
+) -> dict[str, dict[str, str]]:
+    session = APIBackend().build_chat_session(
+        session_system_prompt=UTIL_PROMPT["extract_factors_system"],
+    )
+
+    extracted_factor_dict = {}
+    current_user_prompt = content
+
+    for _ in range(10):
+        extract_result_resp = session.build_chat_completion(
+            user_prompt=current_user_prompt,
+            json_mode=False,
+        )
+        re_search_res = re.search(r"```json(.*)```", extract_result_resp, re.S)
+        ret_json_str = re_search_res.group(1) if re_search_res is not None else ""
+        try:
+            ret_dict = json.loads(ret_json_str)
+            parse_success = bool(isinstance(ret_dict, dict)) and "factors" in ret_dict
+        except json.JSONDecodeError:
+            parse_success = False
+        if ret_json_str is None or not parse_success:
+            current_user_prompt = (
+                "Your response didn't follow the instruction; it might be malformed JSON. Try again."
+            )
+        else:
+            factors = ret_dict["factors"]
+            if len(factors) == 0:
+                break
+            for factor_name, factor_description in factors.items():
+                extracted_factor_dict[factor_name] = factor_description
+            current_user_prompt = UTIL_PROMPT["extract_factors_follow_user"]
+
+    return extracted_factor_dict
+
+
+def __extract_factors_formulation_from_content(
+    content: str,
+    factor_dict: dict[str, str],
+) -> dict[str, dict[str, str]]:
+    factor_dict_df = pd.DataFrame(
+        factor_dict.items(),
+        columns=["factor_name", "factor_description"],
+    )
+
+    system_prompt = UTIL_PROMPT["extract_factor_formulation_system"]
+    current_user_prompt = Template(
+        UTIL_PROMPT["extract_factor_formulation_user"],
+    ).render(report_content=content, factor_dict=factor_dict_df.to_string())
+
+    session = APIBackend().build_chat_session(session_system_prompt=system_prompt)
+    factor_to_formulation = {}
+
+    for _ in range(10):
+        extract_result_resp = session.build_chat_completion(
+            user_prompt=current_user_prompt,
+            json_mode=False,
+        )
+        re_search_res = re.search(r"```json(.*)```", extract_result_resp, re.S)
+        ret_json_str = re_search_res.group(1) if re_search_res is not None else ""
+        try:
+            ret_dict = json.loads(ret_json_str)
+            parse_success = bool(isinstance(ret_dict, dict))
+        except json.JSONDecodeError:
+            parse_success = False
+        if ret_json_str is None or not parse_success:
+            current_user_prompt = (
+                "Your response didn't follow the instruction; it might be malformed JSON. Try again."
+            )
+        else:
+            for name, formulation_and_description in ret_dict.items():
+                if name in factor_dict:
+                    factor_to_formulation[name] = formulation_and_description
+            if len(factor_to_formulation) != len(factor_dict):
+                remain_df = factor_dict_df[~factor_dict_df["factor_name"].isin(factor_to_formulation)]
+                current_user_prompt = (
+                    "Some factors are missing. Please check the following"
+                    " factors and their descriptions and continue extraction.\n"
+                    "==========================Remaining factors"
+                    "==========================\n" + remain_df.to_string()
+                )
+            else:
+                break
+
+    return factor_to_formulation
+
+
+def extract_factor_and_formulation_from_one_report(
+    content: str,
+) -> dict[str, dict[str, str]]:
+    final_factor_dict_to_one_report = {}
+    factor_dict = __extract_factors_name_and_desc_from_content(content)
+    if len(factor_dict) != 0:
+        factor_to_formulation = __extract_factors_formulation_from_content(
+            content,
+            factor_dict,
+        )
+        for factor_name in factor_dict:
+            final_factor_dict_to_one_report.setdefault(factor_name, {})
+            final_factor_dict_to_one_report[factor_name]["description"] = factor_dict[factor_name]
+
+            # use code to correct _ in formulation
+            formulation = factor_to_formulation[factor_name]["formulation"]
+            if factor_name in formulation:
+                target_factor_name = factor_name.replace("_", r"\_")
+                formulation = formulation.replace(factor_name, target_factor_name)
+            for variable in factor_to_formulation[factor_name]["variables"]:
+                if variable in formulation:
+                    target_variable = variable.replace("_", r"\_")
+                    formulation = formulation.replace(variable, target_variable)
+
+            final_factor_dict_to_one_report[factor_name]["formulation"] = formulation
+            final_factor_dict_to_one_report[factor_name]["variables"] = factor_to_formulation[factor_name]["variables"]
+
+    return final_factor_dict_to_one_report
+
+
+def extract_factors_from_report_dict_and_classify_result(
+    report_dict: dict[str, str],
+    useful_no_dict: dict[str, dict[str, str]],
+    n_proc: int = 11,
+) -> dict[str, dict[str, dict[str, str]]]:
+    useful_report_dict = {}
+    for key, value in useful_no_dict.items():
+        if isinstance(value, dict):
+            if int(value.get("class")) == 1:
+                useful_report_dict[key] = report_dict[key]
+        else:
+            FinCoLog().warning(f"Unexpected input format: {key}")
+
+    final_report_factor_dict = {}
+    # for file_name, content in useful_report_dict.items():
+    #     final_report_factor_dict.setdefault(file_name, {})
+    #     final_report_factor_dict[
+    #         file_name
+    #     ] = extract_factor_and_formulation_from_one_report(content)
+
+    while len(final_report_factor_dict) != len(useful_report_dict):
+        pool = mp.Pool(n_proc)
+        pool_result_list = []
+        file_names = []
+        for file_name, content in useful_report_dict.items():
+            if file_name in final_report_factor_dict:
+                continue
+            file_names.append(file_name)
+            pool_result_list.append(
+                pool.apply_async(
+                    extract_factor_and_formulation_from_one_report,
+                    (content,),
+                ),
+            )
+
+        pool.close()
+        pool.join()
+
+        for index, result in enumerate(pool_result_list):
+            if result.get() is not None:
+                file_name = file_names[index]
+                final_report_factor_dict.setdefault(file_name, {})
+                final_report_factor_dict[file_name] = result.get()
+        FinCoLog().info(f"Finished factor extraction for {len(final_report_factor_dict)} reports")
+
+    return final_report_factor_dict
+
+
+def check_factor_dict_viability_simulate_json_mode(
+    factor_df_string: str,
+) -> dict[str, dict[str, str]]:
+    session = APIBackend().build_chat_session(
+        session_system_prompt=UTIL_PROMPT["factor_viability_system"],
+    )
+    current_user_prompt = factor_df_string
+
+    for _ in range(10):
+        extract_result_resp = session.build_chat_completion(
+            user_prompt=current_user_prompt,
+            json_mode=False,
+        )
+        re_search_res = re.search(r"```json(.*)```", extract_result_resp, re.S)
+        ret_json_str = re_search_res.group(1) if re_search_res is not None else ""
+        try:
+            ret_dict = json.loads(ret_json_str)
+            parse_success = bool(isinstance(ret_dict, dict))
+        except json.JSONDecodeError:
+            parse_success = False
+        if ret_json_str is None or not parse_success:
+            current_user_prompt = (
+                "Your response didn't follow the instruction; it might be malformed JSON. Try again."
+            )
+        else:
+            return ret_dict
+    return {}
+
+
+def check_factor_dict_viability(
+    factor_dict: dict[str, dict[str, str]],
+) -> dict[str, dict[str, str]]:
+    factor_viability_dict = {}
+
+    factor_df = pd.DataFrame(factor_dict).T
+    factor_df.index.names = ["factor_name"]
+
+    while factor_df.shape[0] > 0:
+        pool = mp.Pool(8)
+
+        result_list = []
+        for i in range(0, factor_df.shape[0], 50):
+            target_factor_df_string = factor_df.iloc[i : i + 50, :].to_string()
+
+            result_list.append(
+                pool.apply_async(
+                    check_factor_dict_viability_simulate_json_mode,
+                    (target_factor_df_string,),
+                ),
+            )
+
+        pool.close()
+        pool.join()
+
+        for result in result_list:
+            respond = result.get()
+            for factor_name, viability in respond.items():
+                factor_viability_dict[factor_name] = viability
+
+        factor_df = factor_df[~factor_df.index.isin(factor_viability_dict)]
+
+    return factor_viability_dict
+
+
+def check_factor_duplication_simulate_json_mode(
+    factor_df: pd.DataFrame,
+) -> list[list[str]]:
+    session = APIBackend().build_chat_session(
+        session_system_prompt=UTIL_PROMPT["factor_duplicate_system"],
+    )
+    current_user_prompt = factor_df.to_string()
+
+    generated_duplicated_groups = []
+    for _ in range(20):
+        extract_result_resp = session.build_chat_completion(
+            user_prompt=current_user_prompt,
+            json_mode=False,
+        )
+        re_search_res = re.search(r"```json(.*)```", extract_result_resp, re.S)
+        ret_json_str = re_search_res.group(1) if re_search_res is not None else ""
+        try:
+            ret_dict = json.loads(ret_json_str)
+            parse_success = bool(isinstance(ret_dict, list))
+        except json.JSONDecodeError:
+            parse_success = False
+        if ret_json_str is None or not parse_success:
+            current_user_prompt = (
+                "Your previous response didn't follow the instruction; it might be"
+                " malformed JSON. Try reducing the number of factors."
+            )
+        elif len(ret_dict) == 0:
+            return generated_duplicated_groups
+        else:
+            generated_duplicated_groups.extend(ret_dict)
+            current_user_prompt = (
+                "Continue to extract duplicated groups. If no more duplicated"
+                " groups are found, please respond with an empty list."
+            )
+    return generated_duplicated_groups
+
+
+def kmeans_embeddings(embeddings: np.ndarray, k: int = 20) -> list[list[str]]:
+    x_normalized = normalize(embeddings)
+
+    kmeans = KMeans(
+        n_clusters=k,
+        init="random",
+        max_iter=100,
+        n_init=10,
+        random_state=42,
+    )
+
+    # KMeans uses Euclidean distance, so we define a helper that finds the most
+    # similar cluster centers by cosine similarity instead
+    def find_closest_cluster_cosine_similarity(
+        data: np.ndarray,
+        centroids: np.ndarray,
+    ) -> np.ndarray:
+        similarity = cosine_similarity(data, centroids)
+        return np.argmax(similarity, axis=1)
+
+    # Initialize the cluster centers
+    rng = np.random.default_rng()
+    centroids = rng.choice(x_normalized, size=k, replace=False)
+
+    # Iterate until convergence or until the maximum number of iterations
+    for _ in range(kmeans.max_iter):
+        # Assign each sample to the closest cluster center
+        closest_clusters = find_closest_cluster_cosine_similarity(
+            x_normalized,
+            centroids,
+        )
+
+        # Update the cluster centers
+        new_centroids = np.array(
+            [x_normalized[closest_clusters == i].mean(axis=0) for i in range(k)],
+        )
+        new_centroids = normalize(new_centroids)  # normalize the new cluster centers
+
+        # Check whether the cluster centers changed
+        if np.allclose(centroids, new_centroids):
+            break
+
+        centroids = new_centroids
+
+    clusters = find_closest_cluster_cosine_similarity(x_normalized, centroids)
+    cluster_to_index = {}
+    for index, cluster in enumerate(clusters):
+        cluster_to_index.setdefault(cluster, []).append(index)
+    return sorted(
+        cluster_to_index.values(),
+        key=lambda x: len(x),
+        reverse=True,
+    )
+
+
+def deduplicate_factor_dict(factor_dict: dict[str, dict[str, str]]) -> list[list[str]]:
+    factor_df = pd.DataFrame(factor_dict).T
+    factor_df.index.names = ["factor_name"]
+
+    factor_names = sorted(factor_dict)
+
+    factor_name_to_full_str = {}
+    for factor_name in factor_dict:
+        description = factor_dict[factor_name]["description"]
+        formulation = factor_dict[factor_name]["formulation"]
+        variables = factor_dict[factor_name]["variables"]
+        factor_name_to_full_str[
+            factor_name
+        ] = f"""Factor name: {factor_name}
+Factor description: {description}
+Factor formulation: {formulation}
+Factor variables: {variables}
+"""
+
+    full_str_list = [factor_name_to_full_str[factor_name] for factor_name in factor_names]
+    embeddings = create_embedding_with_multiprocessing(full_str_list)
+
+    target_k = None
+    if len(full_str_list) < Config().max_input_duplicate_factor_group:
+        kmeans_index_group = [list(range(len(full_str_list)))]
+        target_k = 1
+    else:
+        for k in range(
+            len(full_str_list) // Config().max_input_duplicate_factor_group,
+            30,
+        ):
+            kmeans_index_group = kmeans_embeddings(embeddings=embeddings, k=k)
+            if len(kmeans_index_group[0]) < Config().max_input_duplicate_factor_group:
+                target_k = k
+                FinCoLog().info(f"K-means group number: {k}")
+                break
+    factor_name_groups = [[factor_names[index] for index in index_group] for index_group in kmeans_index_group]
+
+    duplication_names_list = []
+
+    pool = mp.Pool(target_k)
+    result_list = [
+        pool.apply_async(
+            check_factor_duplication_simulate_json_mode,
+            (factor_df.loc[factor_name_group, :],),
+        )
+        for factor_name_group in factor_name_groups
+    ]
+
+    pool.close()
+    pool.join()
+
+    for result in result_list:
+        deduplication_factor_names_list = result.get()
+        for deduplication_factor_names in deduplication_factor_names_list:
+            filter_factor_names = [
+                factor_name for factor_name in set(deduplication_factor_names) if factor_name in factor_dict
+            ]
+            if len(filter_factor_names) > 1:
+                duplication_names_list.append(filter_factor_names)
+
+    return duplication_names_list
+
+
+def deduplicate_factors_several_times(
+    factor_dict: dict[str, dict[str, str]],
+) -> list[list[str]]:
+    final_duplication_names_list = []
+    current_round_factor_dict = factor_dict
+    for _ in range(10):
+        duplication_names_list = deduplicate_factor_dict(current_round_factor_dict)
+
+        new_round_names = []
+        for duplication_names in duplication_names_list:
+            if len(duplication_names) < Config().max_output_duplicate_factor_group:
+                final_duplication_names_list.append(duplication_names)
+            else:
+                new_round_names.extend(duplication_names)
+
+        if len(new_round_names) != 0:
+            current_round_factor_dict = {factor_name: factor_dict[factor_name] for factor_name in new_round_names}
+        else:
+            return final_duplication_names_list
+    return final_duplication_names_list
diff --git a/rdagent/document_process/document_reader.py b/rdagent/document_process/document_reader.py
new file mode 100644
index 00000000..88f1fe45
--- /dev/null
+++ b/rdagent/document_process/document_reader.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import yaml
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+
+from core.conf import FincoSettings as Config
+from oai.llm_utils import APIBackend, create_embedding_with_multiprocessing
+from core.log import FinCoLog
+
+if TYPE_CHECKING:
+    from langchain_core.documents import Document
+
+from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
+
+with (Path(__file__).parent / "util_prompt.yaml").open(encoding="utf8") as f:
+    UTIL_PROMPT = yaml.safe_load(
+        f,
+    )
+
+
+def load_documents_by_langchain(path: Path) -> list:
+    """Load documents from the specified path.
+
+    Args:
+        path (str): The path to the directory or file containing the documents.
+
+    Returns:
+        list: A list of loaded documents.
+    """
+    loader = PyPDFDirectoryLoader(str(path), silent_errors=True) if path.is_dir() else PyPDFLoader(str(path))
+    return loader.load()
+
+
+def process_documents_by_langchain(docs: list[Document]) -> dict[str, str]:
+    """Process a list of documents and group them by document name.
+
+    Args:
+        docs (list): A list of documents.
+
+    Returns:
+        dict: A dictionary where the keys are document names and the values are
+        the concatenated content of the documents.
+    """
+    content_dict = {}
+
+    for doc in docs:
+        doc_name = str(Path(doc.metadata["source"]).resolve())
+        doc_content = doc.page_content
+
+        if doc_name not in content_dict:
+            content_dict[str(doc_name)] = doc_content
+        else:
+            content_dict[str(doc_name)] += doc_content
+
+    return content_dict
+
+
+def load_and_process_pdfs_by_langchain(path: Path) -> dict[str, str]:
+    return process_documents_by_langchain(load_documents_by_langchain(path))
+
+
+def load_and_process_one_pdf_by_azure_document_intelligence(
+    path: Path,
+    key: str,
+    endpoint: str,
+) -> str:
+    pages = len(PyPDFLoader(str(path)).load())
+    document_analysis_client = DocumentAnalysisClient(
+        endpoint=endpoint,
+        credential=AzureKeyCredential(key),
+    )
+
+    with path.open("rb") as file:
+        result = document_analysis_client.begin_analyze_document(
+            "prebuilt-document",
+            file,
+            pages=f"1-{pages}",
+        ).result()
+    return result.content
+
+
+def load_and_process_pdfs_by_azure_document_intelligence(path: Path) -> dict[str, str]:
+    config = Config()
+
+    assert config.azure_document_intelligence_key is not None
+    assert config.azure_document_intelligence_endpoint is not None
+
+    content_dict = {}
+    ab_path = path.resolve()
+    if ab_path.is_file():
+        assert ".pdf" in ab_path.suffixes, "The file must be a PDF file."
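+        # Single file: analyze it directly with Azure Document Intelligence.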
+ proc = load_and_process_one_pdf_by_azure_document_intelligence + content_dict[str(ab_path)] = proc( + ab_path, + config.azure_document_intelligence_key, + config.azure_document_intelligence_endpoint, + ) + else: + for file_path in ab_path.rglob("*"): + if file_path.is_file() and ".pdf" in file_path.suffixes: + content_dict[str(file_path)] = load_and_process_one_pdf_by_azure_document_intelligence( + file_path, + config.azure_document_intelligence_key, + config.azure_document_intelligence_endpoint, + ) + return content_dict diff --git a/rdagent/document_analysis/__init__.py b/rdagent/factor_implementation/evolving/__init__.py similarity index 100% rename from rdagent/document_analysis/__init__.py rename to rdagent/factor_implementation/evolving/__init__.py diff --git a/rdagent/factor_implementation/evolving/evaluators.py b/rdagent/factor_implementation/evolving/evaluators.py new file mode 100644 index 00000000..882564d8 --- /dev/null +++ b/rdagent/factor_implementation/evolving/evaluators.py @@ -0,0 +1,231 @@ +from __future__ import annotations + +import re +from typing import List + +from pandas.core.api import DataFrame as DataFrame + +from core.evolving_framework import Evaluator as EvolvingEvaluator +from core.evolving_framework import Feedback, QueriedKnowledge +from core.log import FinCoLog +from factor_implementation.evolving.evolvable_subjects import ( + FactorImplementationList, +) +from factor_implementation.share_modules.conf import FactorImplementSettings +from factor_implementation.share_modules.evaluator import ( + Evaluator as FactorImplementationEvaluator, +) +from factor_implementation.share_modules.evaluator import ( + FactorImplementationCodeEvaluator, + FactorImplementationFinalDecisionEvaluator, + FactorImplementationValueEvaluator, +) +from factor_implementation.share_modules.factor import ( + FactorImplementation, + FactorImplementationTask, +) +from core.utils import multiprocessing_wrapper + + +class FactorImplementationSingleFeedback: + """This class is a feedback to single implementation which is generated from an evaluator.""" + + def __init__( + self, + execution_feedback: str = None, + value_generated_flag: bool = False, + code_feedback: str = None, + factor_value_feedback: str = None, + final_decision: bool = None, + final_feedback: str = None, + final_decision_based_on_gt: bool = None, + ) -> None: + self.execution_feedback = execution_feedback + self.value_generated_flag = value_generated_flag + self.code_feedback = code_feedback + self.factor_value_feedback = factor_value_feedback + self.final_decision = final_decision + self.final_feedback = final_feedback + self.final_decision_based_on_gt = final_decision_based_on_gt + + def __str__(self) -> str: + return f"""------------------Factor Execution Feedback------------------ +{self.execution_feedback} +------------------Factor Code Feedback------------------ +{self.code_feedback} +------------------Factor Value Feedback------------------ +{self.factor_value_feedback} +------------------Factor Final Feedback------------------ +{self.final_feedback} +------------------Factor Final Decision------------------ +This implementation is {'SUCCESS' if self.final_decision else 'FAIL'}. 
+""" + + +class FactorImplementationsMultiFeedback( + Feedback, + List[FactorImplementationSingleFeedback], +): + """Feedback contains a list, each element is the corresponding feedback for each factor implementation.""" + + +class FactorImplementationEvaluatorV1(FactorImplementationEvaluator): + """This class is the v1 version of evaluator for a single factor implementation. + It calls several evaluators in share modules to evaluate the factor implementation. + """ + + def __init__(self) -> None: + self.code_evaluator = FactorImplementationCodeEvaluator() + self.value_evaluator = FactorImplementationValueEvaluator() + self.final_decision_evaluator = FactorImplementationFinalDecisionEvaluator() + + def evaluate( + self, + target_task: FactorImplementationTask, + implementation: FactorImplementation, + gt_implementation: FactorImplementation = None, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> FactorImplementationSingleFeedback: + if implementation is None: + return None + + target_task_information = target_task.get_factor_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return FactorImplementationSingleFeedback( + execution_feedback="This task has failed too many times, skip implementation.", + value_generated_flag=False, + code_feedback="This task has failed too many times, skip code evaluation.", + factor_value_feedback="This task has failed too many times, skip value evaluation.", + final_decision=False, + final_feedback="This task has failed too many times, skip final decision evaluation.", + final_decision_based_on_gt=False, + ) + else: + factor_feedback = FactorImplementationSingleFeedback() + ( + factor_feedback.execution_feedback, + source_df, + ) = implementation.execute() + + # Remove the long list of numbers in the feedback + pattern = r"(?<=\D)(,\s+-?\d+\.\d+){50,}(?=\D)" + factor_feedback.execution_feedback = re.sub(pattern, ", ", factor_feedback.execution_feedback) + execution_feedback_lines = [ + line for line in factor_feedback.execution_feedback.split("\n") if "warning" not in line.lower() + ] + factor_feedback.execution_feedback = "\n".join(execution_feedback_lines) + + if source_df is None: + factor_feedback.factor_value_feedback = "No factor value generated, skip value evaluation." + factor_feedback.value_generated_flag = False + value_decision = None + else: + factor_feedback.value_generated_flag = True + if gt_implementation is not None: + _, gt_df = gt_implementation.execute(store_result=True) + else: + gt_df = None + try: + source_df = source_df.sort_index() + if gt_df is not None: + gt_df = gt_df.sort_index() + ( + factor_feedback.factor_value_feedback, + value_decision, + ) = self.value_evaluator.evaluate(source_df=source_df, gt_df=gt_df) + except Exception as e: + FinCoLog().warning("Value evaluation failed with exception: %s", e) + factor_feedback.factor_value_feedback = "Value evaluation failed." + value_decision = False + + factor_feedback.final_decision_based_on_gt = gt_implementation is not None + + if value_decision is not None and value_decision is True: + # To avoid confusion, when value_decision is True, we do not need code feedback + factor_feedback.code_feedback = "Final decision is True and there are no code critics." 
+ factor_feedback.final_decision = value_decision + factor_feedback.final_feedback = "Value evaluation passed, skip final decision evaluation." + else: + factor_feedback.code_feedback = self.code_evaluator.evaluate( + target_task=target_task, + implementation=implementation, + execution_feedback=factor_feedback.execution_feedback, + value_feedback=factor_feedback.factor_value_feedback, + gt_implementation=gt_implementation, + ) + ( + factor_feedback.final_decision, + factor_feedback.final_feedback, + ) = self.final_decision_evaluator.evaluate( + target_task=target_task, + execution_feedback=factor_feedback.execution_feedback, + value_feedback=factor_feedback.factor_value_feedback, + code_feedback=factor_feedback.code_feedback, + ) + return factor_feedback + + +class FactorImplementationsMultiEvaluator(EvolvingEvaluator): + def __init__(self, single_evaluator=FactorImplementationEvaluatorV1()) -> None: + super().__init__() + self.single_factor_implementation_evaluator = single_evaluator + + def evaluate( + self, + evo: FactorImplementationList, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> FactorImplementationsMultiFeedback: + multi_implementation_feedback = FactorImplementationsMultiFeedback() + + # for index in range(len(evo.target_factor_tasks)): + # corresponding_implementation = evo.corresponding_implementations[index] + # corresponding_gt_implementation = ( + # evo.corresponding_gt_implementations[index] + # if evo.corresponding_gt_implementations is not None + # else None + # ) + + # multi_implementation_feedback.append( + # self.single_factor_implementation_evaluator.evaluate( + # target_task=evo.target_factor_tasks[index], + # implementation=corresponding_implementation, + # gt_implementation=corresponding_gt_implementation, + # queried_knowledge=queried_knowledge, + # ) + # ) + + calls = [] + for index in range(len(evo.target_factor_tasks)): + corresponding_implementation = evo.corresponding_implementations[index] + corresponding_gt_implementation = ( + evo.corresponding_gt_implementations[index] + if evo.corresponding_gt_implementations is not None + else None + ) + calls.append( + ( + self.single_factor_implementation_evaluator.evaluate, + ( + evo.target_factor_tasks[index], + corresponding_implementation, + corresponding_gt_implementation, + queried_knowledge, + ), + ), + ) + multi_implementation_feedback = multiprocessing_wrapper(calls, n=FactorImplementSettings().evo_multi_proc_n) + + final_decision = [ + None if single_feedback is None else single_feedback.final_decision + for single_feedback in multi_implementation_feedback + ] + print(f"Final decisions: {final_decision} True count: {final_decision.count(True)}") + + return multi_implementation_feedback diff --git a/rdagent/factor_implementation/evolving/evolvable_subjects.py b/rdagent/factor_implementation/evolving/evolvable_subjects.py new file mode 100644 index 00000000..af4ee731 --- /dev/null +++ b/rdagent/factor_implementation/evolving/evolvable_subjects.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import pandas as pd + +from core.evolving_framework import EvolvableSubjects +from core.log import FinCoLog +from factor_implementation.share_modules.factor import ( + FactorImplementation, + FactorImplementationTask, +) + + +class FactorImplementationList(EvolvableSubjects): + """ + Factors is a list. 
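+
+    Concretely (per `__init__` below), it bundles the target factor tasks with
+    their evolving implementations and optional ground-truth implementations,
+    so a whole batch of factors can be evolved together.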
+ """ + + def __init__( + self, + target_factor_tasks: list[FactorImplementationTask], + corresponding_gt_implementations: list[FactorImplementation] = None, + ): + super().__init__() + self.target_factor_tasks = target_factor_tasks + self.corresponding_implementations: list[FactorImplementation] = [] + if corresponding_gt_implementations is not None and len( + corresponding_gt_implementations, + ) != len(target_factor_tasks): + self.corresponding_gt_implementations = None + FinCoLog.warning( + "The length of corresponding_gt_implementations is not equal to the length of target_factor_tasks, set corresponding_gt_implementations to None", + ) + else: + self.corresponding_gt_implementations = corresponding_gt_implementations diff --git a/rdagent/factor_implementation/evolving/evolving_strategy.py b/rdagent/factor_implementation/evolving/evolving_strategy.py new file mode 100644 index 00000000..f07b5a8d --- /dev/null +++ b/rdagent/factor_implementation/evolving/evolving_strategy.py @@ -0,0 +1,298 @@ +from __future__ import annotations + +import json +import random +from abc import abstractmethod +from copy import deepcopy +from typing import TYPE_CHECKING + +from jinja2 import Template + +from core.evolving_framework import EvolvingStrategy, QueriedKnowledge +from oai.llm_utils import APIBackend +from factor_implementation.share_modules.conf import FactorImplementSettings +from factor_implementation.share_modules.factor import ( + FactorImplementation, + FactorImplementationTask, + FileBasedFactorImplementation, +) +from factor_implementation.share_modules.prompt import ( + FactorImplementationPrompts, +) +from factor_implementation.share_modules.utils import get_data_folder_intro +from core.utils import multiprocessing_wrapper + +if TYPE_CHECKING: + from factor_implementation.evolving.evolvable_subjects import ( + FactorImplementationList, + ) + from factor_implementation.evolving.knowledge_management import ( + FactorImplementationQueriedKnowledge, + FactorImplementationQueriedKnowledgeV1, + ) + + +class MultiProcessEvolvingStrategy(EvolvingStrategy): + @abstractmethod + def implement_one_factor( + self, + target_task: FactorImplementationTask, + queried_knowledge: QueriedKnowledge = None, + ) -> FactorImplementation: + raise NotImplementedError + + def evolve( + self, + *, + evo: FactorImplementationList, + queried_knowledge: FactorImplementationQueriedKnowledge | None = None, + **kwargs, + ) -> FactorImplementationList: + new_evo = deepcopy(evo) + new_evo.corresponding_implementations = [None for _ in new_evo.target_factor_tasks] + + to_be_finished_task_index = [] + for index, target_factor_task in enumerate(new_evo.target_factor_tasks): + target_factor_task_desc = target_factor_task.get_factor_information() + if target_factor_task_desc in queried_knowledge.success_task_to_knowledge_dict: + new_evo.corresponding_implementations[index] = queried_knowledge.success_task_to_knowledge_dict[ + target_factor_task_desc + ].implementation + elif ( + target_factor_task_desc not in queried_knowledge.success_task_to_knowledge_dict + and target_factor_task_desc not in queried_knowledge.failed_task_info_set + ): + to_be_finished_task_index.append(index) + if FactorImplementSettings().implementation_factors_per_round < len(to_be_finished_task_index): + to_be_finished_task_index = random.sample( + to_be_finished_task_index, + FactorImplementSettings().implementation_factors_per_round, + ) + + result = multiprocessing_wrapper( + [ + (self.implement_one_factor, (new_evo.target_factor_tasks[target_index], 
queried_knowledge)) + for target_index in to_be_finished_task_index + ], + n=FactorImplementSettings().evo_multi_proc_n, + ) + + for index, target_index in enumerate(to_be_finished_task_index): + new_evo.corresponding_implementations[target_index] = result[index] + + # for target_index in to_be_finished_task_index: + # new_evo.corresponding_implementations[target_index] = self.implement_one_factor( + # new_evo.target_factor_tasks[target_index], queried_knowledge + # ) + + return new_evo + + +class FactorEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_factor( + self, + target_task: FactorImplementationTask, + queried_knowledge: FactorImplementationQueriedKnowledgeV1 = None, + ) -> FactorImplementation: + factor_information_str = target_task.get_factor_information() + + if queried_knowledge is not None and factor_information_str in queried_knowledge.success_task_to_knowledge_dict: + return queried_knowledge.success_task_to_knowledge_dict[factor_information_str].implementation + elif queried_knowledge is not None and factor_information_str in queried_knowledge.failed_task_info_set: + return None + else: + queried_similar_successful_knowledge = ( + queried_knowledge.working_task_to_similar_successful_knowledge_dict[factor_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.working_task_to_former_failed_knowledge_dict[factor_information_str] + if queried_knowledge is not None + else [] + ) + + queried_former_failed_knowledge_to_render = queried_former_failed_knowledge + + system_prompt = Template( + FactorImplementationPrompts()["evolving_strategy_factor_implementation_v1_system"], + ).render( + data_info=get_data_folder_intro(), + queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, + ) + session = APIBackend(use_chat_cache=False).build_chat_session( + session_system_prompt=system_prompt, + ) + + queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge + while True: + user_prompt = ( + Template( + FactorImplementationPrompts()["evolving_strategy_factor_implementation_v1_user"], + ) + .render( + factor_information_str=factor_information_str, + queried_similar_successful_knowledge=queried_similar_successful_knowledge_to_render, + ) + .strip("\n") + ) + if ( + session.build_chat_completion_message_and_calculate_token( + user_prompt, + ) + < FactorImplementSettings().chat_token_limit + ): + break + elif len(queried_former_failed_knowledge_to_render) > 1: + queried_former_failed_knowledge_to_render = queried_former_failed_knowledge_to_render[1:] + elif len(queried_similar_successful_knowledge_to_render) > 1: + queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge_to_render[1:] + # print( + # f"length of queried_similar_successful_knowledge_to_render: {len(queried_similar_successful_knowledge_to_render)}, length of queried_former_failed_knowledge_to_render: {len(queried_former_failed_knowledge_to_render)}" + # ) + + code = json.loads( + session.build_chat_completion( + user_prompt=user_prompt, + json_mode=True, + ), + )["code"] + # ast.parse(code) + factor_implementation = FileBasedFactorImplementation( + target_task, + code, + ) + + return factor_implementation + + +class FactorEvolvingStrategyWithGraph(MultiProcessEvolvingStrategy): + def implement_one_factor( + self, + target_task: FactorImplementationTask, + queried_knowledge, + ) -> FactorImplementation: + error_summary = FactorImplementSettings().v2_error_summary + 
target_factor_task_information = target_task.get_factor_information() + + if ( + queried_knowledge is not None + and target_factor_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_factor_task_information].implementation + elif queried_knowledge is not None and target_factor_task_information in queried_knowledge.failed_task_info_set: + return None + else: + queried_similar_component_knowledge = ( + queried_knowledge.component_with_success_task[target_factor_task_information] + if queried_knowledge is not None + else [] + ) # A list, [success task implement knowledge] + + queried_similar_error_knowledge = ( + queried_knowledge.error_with_success_task[target_factor_task_information] + if queried_knowledge is not None + else {} + ) # A dict, {{error_type:[[error_imp_knowledge, success_imp_knowledge],...]},...} + + queried_former_failed_knowledge = ( + queried_knowledge.former_traces[target_factor_task_information] if queried_knowledge is not None else [] + ) + + queried_former_failed_knowledge_to_render = queried_former_failed_knowledge + + system_prompt = Template( + FactorImplementationPrompts()["evolving_strategy_factor_implementation_v1_system"], + ).render( + data_info=get_data_folder_intro(), + queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, + ) + + session = APIBackend(use_chat_cache=False).build_chat_session( + session_system_prompt=system_prompt, + ) + + queried_similar_component_knowledge_to_render = queried_similar_component_knowledge + queried_similar_error_knowledge_to_render = queried_similar_error_knowledge + error_summary_critics = "" + while True: + if ( + error_summary + and len(queried_similar_error_knowledge_to_render) != 0 + and len(queried_former_failed_knowledge_to_render) != 0 + ): + error_summary_system_prompt = ( + Template(FactorImplementationPrompts()["evolving_strategy_error_summary_v2_system"]) + .render( + factor_information_str=target_factor_task_information, + code_and_feedback=queried_former_failed_knowledge_to_render[ + -1 + ].get_implementation_and_feedback_str(), + ) + .strip("\n") + ) + session_summary = APIBackend(use_chat_cache=False).build_chat_session( + session_system_prompt=error_summary_system_prompt, + ) + while True: + error_summary_user_prompt = ( + Template(FactorImplementationPrompts()["evolving_strategy_error_summary_v2_user"]) + .render( + queried_similar_component_knowledge=queried_similar_component_knowledge_to_render, + ) + .strip("\n") + ) + if ( + session_summary.build_chat_completion_message_and_calculate_token(error_summary_user_prompt) + < FactorImplementSettings().chat_token_limit + ): + break + elif len(queried_similar_error_knowledge_to_render) > 0: + queried_similar_error_knowledge_to_render = queried_similar_error_knowledge_to_render[:-1] + error_summary_critics = session_summary.build_chat_completion( + user_prompt=error_summary_user_prompt, + json_mode=False, + ) + + user_prompt = ( + Template( + FactorImplementationPrompts()["evolving_strategy_factor_implementation_v2_user"], + ) + .render( + factor_information_str=target_factor_task_information, + queried_similar_component_knowledge=queried_similar_component_knowledge_to_render, + queried_similar_error_knowledge=queried_similar_error_knowledge_to_render, + error_summary=error_summary, + error_summary_critics=error_summary_critics, + ) + .strip("\n") + ) + if ( + session.build_chat_completion_message_and_calculate_token( + user_prompt, + ) + < 
FactorImplementSettings().chat_token_limit + ): + break + elif len(queried_former_failed_knowledge_to_render) > 1: + queried_former_failed_knowledge_to_render = queried_former_failed_knowledge_to_render[1:] + elif len(queried_similar_component_knowledge_to_render) > len( + queried_similar_error_knowledge_to_render, + ): + queried_similar_component_knowledge_to_render = queried_similar_component_knowledge_to_render[:-1] + elif len(queried_similar_error_knowledge_to_render) > 0: + queried_similar_error_knowledge_to_render = queried_similar_error_knowledge_to_render[:-1] + + # print( + # len(queried_similar_component_knowledge_to_render), + # len(queried_similar_error_knowledge_to_render), + # len(queried_former_failed_knowledge_to_render), + # ) + + response = session.build_chat_completion( + user_prompt=user_prompt, + json_mode=True, + ) + code = json.loads(response)["code"] + factor_implementation = FileBasedFactorImplementation(target_task, code) + return factor_implementation diff --git a/rdagent/factor_implementation/evolving/factor_implementation_evolving_cli.py b/rdagent/factor_implementation/evolving/factor_implementation_evolving_cli.py new file mode 100644 index 00000000..fd60b187 --- /dev/null +++ b/rdagent/factor_implementation/evolving/factor_implementation_evolving_cli.py @@ -0,0 +1,311 @@ +import json +import pickle +import subprocess +from pathlib import Path + +import pandas as pd +from fire.core import Fire +from tqdm import tqdm + +from core.evolving_framework import EvoAgent, KnowledgeBase +from factor_implementation.evolving.evaluators import ( + FactorImplementationEvaluatorV1, + FactorImplementationsMultiEvaluator, +) +from factor_implementation.evolving.evolvable_subjects import ( + FactorImplementationList, +) +from factor_implementation.evolving.evolving_strategy import ( + FactorEvolvingStrategy, + FactorEvolvingStrategyWithGraph, +) +from factor_implementation.evolving.knowledge_management import ( + FactorImplementationGraphKnowledgeBase, + FactorImplementationGraphRAGStrategy, + FactorImplementationKnowledgeBaseV1, + FactorImplementationRAGStrategyV1, +) +from factor_implementation.share_modules.factor import ( + FactorImplementationTask, + FileBasedFactorImplementation, +) +from core.utils import multiprocessing_wrapper + +ALPHA101_INIT_COMPONENTS = [ + "1. abs(): absolute value to certain columns", + "2. log(): log value to certain columns", + "3. sign(): sign value to certain columns", + "4. add_two_columns(): add two columns", + "5. minus_two_columns(): minus two columns", + "6. times_two_columns(): times two columns", + "7. divide_two_columns(): divide two columns", + "8. add_value_to_columns(): add value to columns", + "9. minus_value_to_columns(): minus value to columns", + "10. rank(): cross-sectional rank value to columns", + "11. delay(): value of each data d days ago", + "12. correlation(): time-serial correlation of column_left and column_right for the past d days", + "13. covariance(): time-serial covariance of column_left and column_right for the past d days", + "14. scale_to_a(): scale the columns to sum(abs(x)) is a", + "15. delta(): today’s value of x minus the value of x d days ago", + "16. signedpower(): x^a", + "17. decay_linear(): weighted moving average over the past d days with linearly decaying weights d, d – 1, …, 1 (rescaled to sum up to 1)", + "18. indneutralize(): x cross-sectionally neutralized against groups g (subindustries, industries, sectors, etc.), i.e., x is cross-sectionally demeaned within each group g", + "19. 
ts_min(): time-series min over the past d days, operator min applied across the time-series for the past d days; non-integer number of days d is converted to floor(d)", + "20. ts_max(): time-series max over the past d days, operator max applied across the time-series for the past d days; non-integer number of days d is converted to floor(d)", + "21. ts_argmax(): which day ts_max(x, d) occurred on", + "22. ts_argmin(): which day ts_min(x, d) occurred on", + "23. ts_rank(): time-series rank in the past d days", + "24. min(): ts_min(x, d)", + "25. max(): ts_max(x, d)", + "26. sum(): time-series sum over the past d days", + "27. product(): time-series product over the past d days", + "28. stddev(): moving time-series standard deviation over the past d days", +] + + +class FactorImplementationEvolvingCli: + # TODO: we should use polymorphism to load knowledge base, strategies instead of evolving_version + # TODO: Can we refactor FactorImplementationEvolvingCli into a learning framework to differentiate our learning paradiagm with other ones by iteratively retrying? + def __init__(self, evolving_version=2) -> None: + self.evolving_version = evolving_version + self.knowledge_base = None + self.latest_factor_implementations = None + + def run_evolving_framework( + self, + factor_implementations: FactorImplementationList, + factor_knowledge_base: KnowledgeBase, + max_loops: int = 20, + with_knowledge: bool = True, + with_feedback: bool = True, + knowledge_self_gen: bool = True, + ) -> FactorImplementationList: + """ + Main target: Implement factors. + The system also leverages the former knowledge to help implement the factors. Also, new knowledge might be generated during the implementation to help the following implementation. + The gt_code and gt_value in the Factor instance is used to evaluate the implementation, and the feedback is used to generate high-quality knowledge which helps the agent to evolve. 
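+
+        A minimal usage sketch (the `tasks` list is illustrative and must be
+        prepared by the caller; every call below exists in this class):
+
+        .. code-block:: python
+
+            cli = FactorImplementationEvolvingCli(evolving_version=2)
+            kb = cli.load_or_init_knowledge_base(
+                component_init_list=ALPHA101_INIT_COMPONENTS,
+            )
+            subjects = FactorImplementationList(target_factor_tasks=tasks)
+            subjects = cli.run_evolving_framework(subjects, kb, max_loops=5)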
+ """ + es = FactorEvolvingStrategyWithGraph() if self.evolving_version == 2 else FactorEvolvingStrategy() + rag = ( + FactorImplementationGraphRAGStrategy(factor_knowledge_base) + if self.evolving_version == 2 + else FactorImplementationRAGStrategyV1(factor_knowledge_base) + ) + factor_evaluator = FactorImplementationsMultiEvaluator(FactorImplementationEvaluatorV1()) + ea = EvoAgent(es, rag=rag) + + for _ in tqdm(range(max_loops), "Implementing factors"): + factor_implementations = ea.step_evolving( + factor_implementations, + factor_evaluator, + with_knowledge=with_knowledge, + with_feedback=with_feedback, + knowledge_self_gen=knowledge_self_gen, + ) + return factor_implementations + + def load_or_init_knowledge_base(self, former_knowledge_base_path: Path = None, component_init_list: list = []): + if former_knowledge_base_path is not None and former_knowledge_base_path.exists(): + factor_knowledge_base = pickle.load(open(former_knowledge_base_path, "rb")) + if self.evolving_version == 1 and not isinstance( + factor_knowledge_base, FactorImplementationKnowledgeBaseV1 + ): + raise ValueError("The former knowledge base is not compatible with the current version") + elif self.evolving_version == 2 and not isinstance( + factor_knowledge_base, + FactorImplementationGraphKnowledgeBase, + ): + raise ValueError("The former knowledge base is not compatible with the current version") + else: + factor_knowledge_base = ( + FactorImplementationGraphKnowledgeBase( + init_component_list=component_init_list, + ) + if self.evolving_version == 2 + else FactorImplementationKnowledgeBaseV1() + ) + return factor_knowledge_base + + def implement_factors( + self, + factor_implementations: FactorImplementationList, + former_knowledge_base_path: Path = None, + new_knowledge_base_path: Path = None, + component_init_list: list = [], + max_loops: int = 20, + ): + factor_knowledge_base = self.load_or_init_knowledge_base( + former_knowledge_base_path=former_knowledge_base_path, + component_init_list=component_init_list, + ) + + new_factor_implementations = self.run_evolving_framework( + factor_implementations=factor_implementations, + factor_knowledge_base=factor_knowledge_base, + max_loops=max_loops, + with_knowledge=True, + with_feedback=True, + knowledge_self_gen=True, + ) + if new_knowledge_base_path is not None: + pickle.dump(factor_knowledge_base, open(new_knowledge_base_path, "wb")) + self.knowledge_base = factor_knowledge_base + self.latest_factor_implementations = factor_implementations + return new_factor_implementations + + def _read_alpha101_factors( + self, + alpha101_evo_subs_path: Path = None, + alpha101_data_path=Path().cwd() / "git_ignore_folder" / "alpha101_related_files", + start_index=0, + end_index=32, + read_gt_factors=True, + ) -> FactorImplementationList: + """ + Read the alpha101 factors from the alpha101_related_files folder + """ + if alpha101_evo_subs_path is not None and alpha101_evo_subs_path.exists(): + factor_implementations = pickle.load(open(alpha101_evo_subs_path, "rb")) + else: + target_factor_plain_list = json.load( + open(alpha101_data_path / "target_factor_task_list.json"), + ) + name_to_code = json.load(open(alpha101_data_path / "name_to_code.json")) + gt_df = pd.read_hdf(alpha101_data_path / "gt_filtered.h5") + + # First read the target factor task + target_factor_tasks = [] + for factor_list_item in target_factor_plain_list: + target_factor_tasks.append( + FactorImplementationTask( + factor_name=factor_list_item[0], + factor_description=factor_list_item[1], + 
factor_formulation=factor_list_item[2], + factor_formulation_description=factor_list_item[3], + ), + ) + + # Second read the gt factor implementations + corresponding_gt_implementations = [] + for factor_task in target_factor_tasks: + name = factor_task.factor_name + gt_code = name_to_code[name] + gt_value = gt_df.loc(axis=1)[[name]] + corresponding_gt_implementations.append( + FileBasedFactorImplementation( + code=gt_code, + executed_factor_value_dataframe=gt_value, + target_task=factor_task, + ), + ) + + # Finally generate the factor implementations as evolvable subjects + factor_implementations = FactorImplementationList( + target_factor_tasks=target_factor_tasks, + corresponding_gt_implementations=(corresponding_gt_implementations if read_gt_factors else None), + ) + + factor_implementations.target_factor_tasks = factor_implementations.target_factor_tasks[start_index:end_index] + factor_implementations.corresponding_gt_implementations = ( + factor_implementations.corresponding_gt_implementations[start_index:end_index] if read_gt_factors else None + ) + + return factor_implementations + + def implement_alpha101( + self, + max_loops=30, + ) -> FactorImplementationList: + """ + Implement the alpha101 factors to gather knowledge TODO: implement the code + """ + factor_implementations = self._read_alpha101_factors( + alpha101_evo_subs_path=Path.cwd() / "alpha101_evo_subs.pkl", + start_index=0, + end_index=64, + read_gt_factors=True, + ) + self.implement_factors( + factor_implementations, + former_knowledge_base_path=Path.cwd() + / f"alpha101_knowledge_base_v{self.evolving_version}_project_product.pkl", + new_knowledge_base_path=Path.cwd() + / f"alpha101_knowledge_base_v{self.evolving_version}_project_product.pkl", + component_init_list=ALPHA101_INIT_COMPONENTS, + max_loops=100, + ) + + factor_implementations = self._read_alpha101_factors( + alpha101_evo_subs_path=Path.cwd() / "alpha101_evo_subs.pkl", + start_index=64, + end_index=96, + read_gt_factors=False, + ) + final_imp = self.implement_factors( + factor_implementations, + former_knowledge_base_path=Path.cwd() + / f"alpha101_knowledge_base_v{self.evolving_version}_project_product.pkl", + new_knowledge_base_path=Path.cwd() + / f"alpha101_knowledge_base_v{self.evolving_version}_self_evolving_project_product.pkl", + component_init_list=ALPHA101_INIT_COMPONENTS, + max_loops=10, + ) + final_imp.corresponding_gt_implementations = factor_implementations = self._read_alpha101_factors( + alpha101_evo_subs_path=Path.cwd() / "alpha101_evo_subs.pkl", + start_index=64, + end_index=96, + read_gt_factors=True, + ).corresponding_gt_implementations + + feedbacks = FactorImplementationsMultiEvaluator().evaluate(final_imp) + print([feedback.final_decision if feedback is not None else None for feedback in feedbacks].count(True)) + + def implement_amc( + self, evo_sub_path_str, former_knowledge_base_path_str, implementation_dump_path_str, slice_index + ): + factor_implementations: FactorImplementationList = pickle.load(open(evo_sub_path_str, "rb")) + factor_implementations.target_factor_tasks = factor_implementations.target_factor_tasks[ + slice_index * 16 : slice_index * 16 + 16 + ] + if len(factor_implementations.target_factor_tasks) == 0: + return + if Path(implementation_dump_path_str).exists(): + return + factor_implementations = self.implement_factors( + factor_implementations, + former_knowledge_base_path=Path(former_knowledge_base_path_str), + component_init_list=ALPHA101_INIT_COMPONENTS, + max_loops=10, + ) + pickle.dump(factor_implementations, 
open(implementation_dump_path_str, "wb")) + + def execute_command(self, command, cwd): + print(command, cwd) + try: + subprocess.check_output( + command, + shell=True, + cwd=cwd, + ) + except subprocess.CalledProcessError as e: + print(e.output.decode()) + + def multi_inference_amc_factors(self, type): + slice_count = {"price_volume": 35, "fundamental": 24, "high_frequency": 16}[type] + res = multiprocessing_wrapper( + [ + ( + self.execute_command, + ( + f"python src/scripts/factor_implementation/baselines/evolving/factor_implementation_evolving_cli.py implement_amc ./{type}_factors.pkl ./knowledge_base_v2_with_alpha101_and_10_factors.pkl ./inference_amc_factors_{type}_{slice}.pkl {slice}", + Path.cwd(), + ), + ) + for slice in range(slice_count) + ], + n=2, + ) + + +if __name__ == "__main__": + Fire(FactorImplementationEvolvingCli) diff --git a/rdagent/factor_implementation/evolving/knowledge_management.py b/rdagent/factor_implementation/evolving/knowledge_management.py new file mode 100644 index 00000000..f76ca63f --- /dev/null +++ b/rdagent/factor_implementation/evolving/knowledge_management.py @@ -0,0 +1,905 @@ +from __future__ import annotations + +import copy +import json +import random +import re +from itertools import combinations +from pathlib import Path +from typing import Union + +from jinja2 import Template + +from core.evolving_framework import ( + EvolvableSubjects, + EvoStep, + Knowledge, + KnowledgeBase, + QueriedKnowledge, + RAGStrategy, +) +from finco.graph import UndirectedGraph, UndirectedNode +from oai.llm_utils import APIBackend, calculate_embedding_distance_between_str_list +from core.log import FinCoLog +from factor_implementation.evolving.evaluators import ( + FactorImplementationSingleFeedback, +) +from factor_implementation.share_modules.conf import FactorImplementSettings +from factor_implementation.share_modules.factor import ( + FactorImplementation, + FactorImplementationTask, +) +from factor_implementation.share_modules.prompt import ( + FactorImplementationPrompts, +) + + +class FactorImplementationKnowledge(Knowledge): + def __init__( + self, + target_task: FactorImplementationTask, + implementation: FactorImplementation, + feedback: FactorImplementationSingleFeedback, + ) -> None: + """ + Initialize a FactorKnowledge object. The FactorKnowledge object is used to store a factor implementation without the ground truth code and value. + + Args: + factor (Factor): The factor object associated with the KnowledgeManagement. + + Returns: + None + """ + self.target_task = target_task + self.implementation = implementation + self.feedback = feedback + + def get_implementation_and_feedback_str(self) -> str: + return f"""------------------Factor implementation code:------------------ +{self.implementation.code} +------------------Factor implementation feedback:------------------ +{self.feedback!s} +""" + + +class FactorImplementationQueriedKnowledge(QueriedKnowledge): + def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_set: set = set()) -> None: + self.success_task_to_knowledge_dict = success_task_to_knowledge_dict + self.failed_task_info_set = failed_task_info_set + + +class FactorImplementationKnowledgeBaseV1(KnowledgeBase): + def __init__(self) -> None: + self.implementation_trace: dict[str, FactorImplementationKnowledge] = dict() + self.success_task_info_set: set[str] = set() + + self.task_to_embedding = dict() + + def query(self) -> QueriedKnowledge | None: + """ + Query the knowledge base to get the queried knowledge. 
So far is handled in RAG strategy. + """ + raise NotImplementedError + + +class FactorImplementationQueriedKnowledgeV1(FactorImplementationQueriedKnowledge): + def __init__(self) -> None: + self.working_task_to_former_failed_knowledge_dict = dict() + self.working_task_to_similar_successful_knowledge_dict = dict() + super().__init__() + + +class FactorImplementationRAGStrategyV1(RAGStrategy): + def __init__(self, knowledgebase: FactorImplementationKnowledgeBaseV1) -> None: + super().__init__(knowledgebase) + self.current_generated_trace_count = 0 + + def generate_knowledge( + self, + evolving_trace: list[EvoStep], + *, + return_knowledge: bool = False, + ) -> Knowledge | None: + if len(evolving_trace) == self.current_generated_trace_count: + return + else: + for trace_index in range( + self.current_generated_trace_count, + len(evolving_trace), + ): + evo_step = evolving_trace[trace_index] + implementations = evo_step.evolvable_subjects + feedback = evo_step.feedback + for task_index in range(len(implementations.target_factor_tasks)): + target_task = implementations.target_factor_tasks[task_index] + target_task_information = target_task.get_factor_information() + implementation = implementations.corresponding_implementations[task_index] + single_feedback = feedback[task_index] + if single_feedback is None: + continue + single_knowledge = FactorImplementationKnowledge( + target_task=target_task, + implementation=implementation, + feedback=single_feedback, + ) + if target_task_information not in self.knowledgebase.success_task_info_set: + self.knowledgebase.implementation_trace.setdefault( + target_task_information, + [], + ).append(single_knowledge) + + if single_feedback.final_decision == True: + self.knowledgebase.success_task_info_set.add( + target_task_information, + ) + self.current_generated_trace_count = len(evolving_trace) + + def query( + self, + evo: EvolvableSubjects, + evolving_trace: list[EvoStep], + ) -> QueriedKnowledge | None: + v1_query_former_trace_limit = FactorImplementSettings().v1_query_former_trace_limit + v1_query_similar_success_limit = FactorImplementSettings().v1_query_similar_success_limit + fail_task_trial_limit = FactorImplementSettings().fail_task_trial_limit + + queried_knowledge = FactorImplementationQueriedKnowledgeV1() + for target_factor_task in evo.target_factor_tasks: + target_factor_task_information = target_factor_task.get_factor_information() + if target_factor_task_information in self.knowledgebase.success_task_info_set: + queried_knowledge.success_task_to_knowledge_dict[target_factor_task_information] = ( + self.knowledgebase.implementation_trace[target_factor_task_information][-1] + ) + else: + if ( + len( + self.knowledgebase.implementation_trace.setdefault( + target_factor_task_information, + [], + ), + ) + >= fail_task_trial_limit + ): + queried_knowledge.failed_task_info_set.add(target_factor_task_information) + else: + queried_knowledge.working_task_to_former_failed_knowledge_dict[target_factor_task_information] = ( + self.knowledgebase.implementation_trace.setdefault( + target_factor_task_information, + [], + )[-v1_query_former_trace_limit:] + ) + + knowledge_base_success_task_list = list( + self.knowledgebase.success_task_info_set, + ) + similarity = calculate_embedding_distance_between_str_list( + [target_factor_task_information], + knowledge_base_success_task_list, + )[0] + similar_indexes = sorted( + range(len(similarity)), + key=lambda i: similarity[i], + reverse=True, + )[:v1_query_similar_success_limit] + similar_successful_knowledge = 
[
+                    self.knowledgebase.implementation_trace.setdefault(
+                        knowledge_base_success_task_list[index],
+                        [],
+                    )[-1]
+                    for index in similar_indexes
+                ]
+                queried_knowledge.working_task_to_similar_successful_knowledge_dict[
+                    target_factor_task_information
+                ] = similar_successful_knowledge
+        return queried_knowledge
+
+
+class FactorImplementationQueriedGraphKnowledge(FactorImplementationQueriedKnowledge):
+    # Aggregation of knowledge
+    def __init__(
+        self,
+        former_traces: dict | None = None,
+        component_with_success_task: dict | None = None,
+        error_with_success_task: dict | None = None,
+        **kwargs,
+    ) -> None:
+        # Create the default dicts per instance; a mutable default argument
+        # would be shared across every query result.
+        self.former_traces = former_traces if former_traces is not None else {}
+        self.component_with_success_task = (
+            component_with_success_task if component_with_success_task is not None else {}
+        )
+        self.error_with_success_task = error_with_success_task if error_with_success_task is not None else {}
+        super().__init__(**kwargs)
+
+
+class FactorImplementationGraphRAGStrategy(RAGStrategy):
+    def __init__(self, knowledgebase: FactorImplementationGraphKnowledgeBase) -> None:
+        super().__init__(knowledgebase)
+        self.current_generated_trace_count = 0
+        self.prompt = FactorImplementationPrompts()
+
+    def generate_knowledge(
+        self,
+        evolving_trace: list[EvoStep],
+        *,
+        return_knowledge: bool = False,
+    ) -> Knowledge | None:
+        if len(evolving_trace) == self.current_generated_trace_count:
+            return None
+
+        else:
+            for trace_index in range(self.current_generated_trace_count, len(evolving_trace)):
+                evo_step = evolving_trace[trace_index]
+                implementations = evo_step.evolvable_subjects
+                feedback = evo_step.feedback
+                for task_index in range(len(implementations.target_factor_tasks)):
+                    single_feedback = feedback[task_index]
+                    if single_feedback is None:
+                        continue
+                    target_task = implementations.target_factor_tasks[task_index]
+                    target_task_information = target_task.get_factor_information()
+                    implementation = implementations.corresponding_implementations[task_index]
+                    single_knowledge = FactorImplementationKnowledge(
+                        target_task=target_task,
+                        implementation=implementation,
+                        feedback=single_feedback,
+                    )
+                    if (
+                        target_task_information not in self.knowledgebase.success_task_to_knowledge_dict
+                        and implementation is not None
+                    ):
+                        self.knowledgebase.working_trace_knowledge.setdefault(target_task_information, []).append(
+                            single_knowledge,
+                        )  # save to working trace
+                        if single_feedback.final_decision is True:
+                            self.knowledgebase.success_task_to_knowledge_dict.setdefault(
+                                target_task_information,
+                                single_knowledge,
+                            )
+                            # Summarize the last step and update the knowledge graph
+                            self.knowledgebase.update_success_task(
+                                target_task_information,
+                            )
+                        else:
+                            # generate an error node and store it into the knowledge base
+                            error_analysis_result = []
+                            if not single_feedback.value_generated_flag:
+                                error_analysis_result = self.analyze_error(
+                                    single_feedback.execution_feedback,
+                                    feedback_type="execution",
+                                )
+                            else:
+                                error_analysis_result = self.analyze_error(
+                                    single_feedback.factor_value_feedback,
+                                    feedback_type="value",
+                                )
+                            self.knowledgebase.working_trace_error_analysis.setdefault(
+                                target_task_information,
+                                [],
+                            ).append(
+                                error_analysis_result,
+                            )  # save to the working-trace error record, for graph updates
+
+            self.current_generated_trace_count = len(evolving_trace)
+            return None
+
+    def query(self, evo: EvolvableSubjects, evolving_trace: list[EvoStep]) -> QueriedKnowledge | None:
+        conf_knowledge_sampler = FactorImplementSettings().v2_knowledge_sampler
+        factor_implementation_queried_graph_knowledge = FactorImplementationQueriedGraphKnowledge(
+            success_task_to_knowledge_dict=self.knowledgebase.success_task_to_knowledge_dict,
+        )
+
+        factor_implementation_queried_graph_knowledge = self.former_trace_query(
+            evo,
+            factor_implementation_queried_graph_knowledge,
+            FactorImplementSettings().v2_query_former_trace_limit,
+        )
+        factor_implementation_queried_graph_knowledge = self.component_query(
+            evo,
+            factor_implementation_queried_graph_knowledge,
+            FactorImplementSettings().v2_query_component_limit,
+            knowledge_sampler=conf_knowledge_sampler,
+        )
+        factor_implementation_queried_graph_knowledge = self.error_query(
+            evo,
+            factor_implementation_queried_graph_knowledge,
+            FactorImplementSettings().v2_query_error_limit,
+            knowledge_sampler=conf_knowledge_sampler,
+        )
+        return factor_implementation_queried_graph_knowledge
+
+    def analyze_component(
+        self,
+        target_factor_task_information,
+    ) -> list[UndirectedNode]:  # Hardcode: certain component nodes
+        all_component_nodes = self.knowledgebase.graph.get_all_nodes_by_label_list(["component"])
+        all_component_content = ""
+        for component_node in all_component_nodes:
+            all_component_content += f"{component_node.content}, \n"
+        analyze_component_system_prompt = Template(self.prompt["analyze_component_prompt_v1_system"]).render(
+            all_component_content=all_component_content,
+        )
+
+        analyze_component_user_prompt = target_factor_task_information
+        try:
+            component_no_list = json.loads(
+                APIBackend().build_messages_and_create_chat_completion(
+                    system_prompt=analyze_component_system_prompt,
+                    user_prompt=analyze_component_user_prompt,
+                    json_mode=True,
+                ),
+            )["component_no_list"]
+            return [all_component_nodes[index - 1] for index in sorted(set(component_no_list))]
+        except Exception:
+            # The response may fail JSON parsing or contain invalid indices;
+            # treat both cases as "no components recognized".
+            FinCoLog().warning("Error when analyzing components.")
+
+        return []
+
+    def analyze_error(
+        self,
+        single_feedback,
+        feedback_type="execution",
+    ) -> list[
+        UndirectedNode | str
+    ]:  # Hardcode: raised errors, existing error nodes + not-yet-existing error nodes (plain strings here)
+        if feedback_type == "execution":
+            match = re.search(
+                r'File "(?P<file>.+)", line (?P<line>\d+), in (?P<function>.+)\n\s+(?P<error_line>.+)\n(?P<error_type>\w+): (?P<error_message>.+)',
+                single_feedback,
+            )
+            if match:
+                error_details = match.groupdict()
+                error_type = error_details["error_type"]
+                error_line = error_details["error_line"]
+                error_contents = [f"ErrorType: {error_type}" + "\n" + f"Error line: {error_line}"]
+            else:
+                error_contents = ["Undefined Error"]
+        elif feedback_type == "value":  # value check error
+            value_check_types = r"The source dataframe and the ground truth dataframe have different rows count.|The source dataframe and the ground truth dataframe have different index.|Some values differ by more than the tolerance of 1e-6.|No sufficient correlation found when shifting up|Something wrong happens when naming the multi indices of the dataframe."
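+            # re.findall collects every known value-check message that appears in
+            # the feedback, so a single run can surface several candidate errors.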
+            error_contents = re.findall(value_check_types, single_feedback)
+        else:
+            error_contents = ["Undefined Error"]
+
+        all_error_nodes = self.knowledgebase.graph.get_all_nodes_by_label_list(["error"])
+        if not all_error_nodes:
+            return error_contents
+        else:
+            error_list = []
+            for error_content in error_contents:
+                # Reuse an existing error node when its content matches; otherwise
+                # keep the raw message string so a new node can be created later.
+                matched_node = next(
+                    (node for node in all_error_nodes if node.content == error_content),
+                    None,
+                )
+                matched = matched_node if matched_node is not None else error_content
+                if matched not in error_list:
+                    error_list.append(matched)
+
+            return error_list
+
+    def former_trace_query(
+        self,
+        evo: EvolvableSubjects,
+        factor_implementation_queried_graph_knowledge: FactorImplementationQueriedGraphKnowledge,
+        v2_query_former_trace_limit: int = 5,
+    ) -> Union[QueriedKnowledge, set]:
+        """
+        Query the former-trace knowledge of the working trace, and collect the task
+        information of every task that has already been tried at least
+        fail_task_trial_limit times without success.
+        """
+        fail_task_trial_limit = FactorImplementSettings().fail_task_trial_limit
+
+        for target_factor_task in evo.target_factor_tasks:
+            target_factor_task_information = target_factor_task.get_factor_information()
+            if (
+                target_factor_task_information not in self.knowledgebase.success_task_to_knowledge_dict
+                and target_factor_task_information in self.knowledgebase.working_trace_knowledge
+                and len(self.knowledgebase.working_trace_knowledge[target_factor_task_information])
+                >= fail_task_trial_limit
+            ):
+                factor_implementation_queried_graph_knowledge.failed_task_info_set.add(target_factor_task_information)
+
+            if (
+                target_factor_task_information not in self.knowledgebase.success_task_to_knowledge_dict
+                and target_factor_task_information
+                not in factor_implementation_queried_graph_knowledge.failed_task_info_set
+                and target_factor_task_information in self.knowledgebase.working_trace_knowledge
+            ):
+                former_trace_knowledge = copy.copy(
+                    self.knowledgebase.working_trace_knowledge[target_factor_task_information],
+                )
+                # In the former-trace query we drop the failed right-hand trace whenever the pattern [..., value_generated_flag is True, value_generated_flag is False, ...] appears,
+ # because we think this order means a deterioration of the trial (like a wrong gradient descent) + current_index = 1 + while current_index < len(former_trace_knowledge): + if ( + not former_trace_knowledge[current_index].feedback.value_generated_flag + and former_trace_knowledge[current_index - 1].feedback.value_generated_flag + ): + former_trace_knowledge.pop(current_index) + else: + current_index += 1 + + factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information] = ( + former_trace_knowledge[-v2_query_former_trace_limit:] + ) + else: + factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information] = [] + + return factor_implementation_queried_graph_knowledge + + def component_query( + self, + evo: EvolvableSubjects, + factor_implementation_queried_graph_knowledge: FactorImplementationQueriedGraphKnowledge, + v2_query_component_limit: int = 5, + knowledge_sampler: float = 1.0, + ) -> QueriedKnowledge | None: + # queried_component_knowledge = FactorImplementationQueriedGraphComponentKnowledge() + for target_factor_task in evo.target_factor_tasks: + target_factor_task_information = target_factor_task.get_factor_information() + if ( + target_factor_task_information in self.knowledgebase.success_task_to_knowledge_dict + or target_factor_task_information in factor_implementation_queried_graph_knowledge.failed_task_info_set + ): + factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ] = [] + else: + if target_factor_task_information not in self.knowledgebase.task_to_component_nodes: + self.knowledgebase.task_to_component_nodes[target_factor_task_information] = self.analyze_component( + target_factor_task_information, + ) + + component_analysis_result = self.knowledgebase.task_to_component_nodes[target_factor_task_information] + + if len(component_analysis_result) > 1: + task_des_node_list = self.knowledgebase.graph_query_by_intersection( + component_analysis_result, + constraint_labels=["task_description"], + ) + single_component_constraint = (v2_query_component_limit // len(component_analysis_result)) + 1 + else: + task_des_node_list = [] + single_component_constraint = v2_query_component_limit + factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ] = [] + for component_node in component_analysis_result: + # Reverse iterate, a trade-off with intersection search + count = 0 + for task_des_node in self.knowledgebase.graph_query_by_node( + node=component_node, + step=1, + constraint_labels=["task_description"], + block=True, + )[::-1]: + if task_des_node not in task_des_node_list: + task_des_node_list.append(task_des_node) + count += 1 + if count >= single_component_constraint: + break + + for node in task_des_node_list: + for searched_node in self.knowledgebase.graph_query_by_node( + node=node, + step=50, + constraint_labels=[ + "task_success_implement", + ], + block=True, + ): + if searched_node.label == "task_success_implement": + target_knowledge = self.knowledgebase.node_to_implementation_knowledge_dict[ + searched_node.id + ] + if ( + target_knowledge + not in factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ] + ): + factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ].append(target_knowledge) + + # finally add embedding related knowledge + knowledge_base_success_task_list = 
list(self.knowledgebase.success_task_to_knowledge_dict) + + similarity = calculate_embedding_distance_between_str_list( + [target_factor_task_information], + knowledge_base_success_task_list, + )[0] + similar_indexes = sorted( + range(len(similarity)), + key=lambda i: similarity[i], + reverse=True, + ) + embedding_similar_successful_knowledge = [ + self.knowledgebase.success_task_to_knowledge_dict[knowledge_base_success_task_list[index]] + for index in similar_indexes + ] + for knowledge in embedding_similar_successful_knowledge: + if ( + knowledge + not in factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ] + ): + factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ].append(knowledge) + + if knowledge_sampler > 0: + factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ] = [ + knowledge + for knowledge in factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ] + if random.uniform(0, 1) <= knowledge_sampler + ] + + # Make sure no less than half of the knowledge are from GT + queried_knowledge_list = factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ] + queried_from_gt_knowledge_list = [ + knowledge + for knowledge in queried_knowledge_list + if knowledge.feedback is not None and knowledge.feedback.final_decision_based_on_gt == True + ] + queried_without_gt_knowledge_list = [ + knowledge + for knowledge in queried_knowledge_list + if knowledge.feedback is not None and knowledge.feedback.final_decision_based_on_gt == False + ] + queried_from_gt_knowledge_count = max( + min(v2_query_component_limit // 2, len(queried_from_gt_knowledge_list)), + v2_query_component_limit - len(queried_without_gt_knowledge_list), + ) + factor_implementation_queried_graph_knowledge.component_with_success_task[ + target_factor_task_information + ] = ( + queried_from_gt_knowledge_list[:queried_from_gt_knowledge_count] + + queried_without_gt_knowledge_list[: v2_query_component_limit - queried_from_gt_knowledge_count] + ) + + return factor_implementation_queried_graph_knowledge + + def error_query( + self, + evo: EvolvableSubjects, + factor_implementation_queried_graph_knowledge: FactorImplementationQueriedGraphKnowledge, + v2_query_error_limit: int = 5, + knowledge_sampler: float = 1.0, + ) -> QueriedKnowledge | None: + # queried_error_knowledge = FactorImplementationQueriedGraphErrorKnowledge() + for task_index, target_factor_task in enumerate(evo.target_factor_tasks): + target_factor_task_information = target_factor_task.get_factor_information() + factor_implementation_queried_graph_knowledge.error_with_success_task[target_factor_task_information] = {} + if ( + target_factor_task_information in self.knowledgebase.success_task_to_knowledge_dict + or target_factor_task_information in factor_implementation_queried_graph_knowledge.failed_task_info_set + ): + factor_implementation_queried_graph_knowledge.error_with_success_task[ + target_factor_task_information + ] = [] + else: + factor_implementation_queried_graph_knowledge.error_with_success_task[ + target_factor_task_information + ] = [] + if ( + target_factor_task_information in self.knowledgebase.working_trace_error_analysis + and len(self.knowledgebase.working_trace_error_analysis[target_factor_task_information]) > 0 + and 
len(factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information]) + > 0 + ): + queried_last_trace = factor_implementation_queried_graph_knowledge.former_traces[ + target_factor_task_information + ][-1] + target_index = self.knowledgebase.working_trace_knowledge[target_factor_task_information].index( + queried_last_trace, + ) + last_knowledge_error_analysis_result = self.knowledgebase.working_trace_error_analysis[ + target_factor_task_information + ][target_index] + else: + last_knowledge_error_analysis_result = [] + + error_nodes = [] + for error_node in last_knowledge_error_analysis_result: + if not isinstance(error_node, UndirectedNode): + error_node = self.knowledgebase.graph_get_node_by_content(content=error_node) + if error_node is None: + continue + error_nodes.append(error_node) + + if len(error_nodes) > 1: + task_trace_node_list = self.knowledgebase.graph_query_by_intersection( + error_nodes, + constraint_labels=["task_trace"], + output_intersection_origin=True, + ) + single_error_constraint = (v2_query_error_limit // len(error_nodes)) + 1 + else: + task_trace_node_list = [] + single_error_constraint = v2_query_error_limit + for error_node in error_nodes: + # Reverse iterate, a trade-off with intersection search + count = 0 + for task_trace_node in self.knowledgebase.graph_query_by_node( + node=error_node, + step=1, + constraint_labels=["task_trace"], + block=True, + )[::-1]: + if task_trace_node not in task_trace_node_list: + task_trace_node_list.append([[error_node], task_trace_node]) + count += 1 + if count >= single_error_constraint: + break + + # for error_node in last_knowledge_error_analysis_result: + # if not isinstance(error_node, UndirectedNode): + # error_node = self.knowledgebase.graph_get_node_by_content(content=error_node) + # if error_node is None: + # continue + # for searched_node in self.knowledgebase.graph_query_by_node( + # node=error_node, + # step=1, + # constraint_labels=["task_trace"], + # block=True, + # ): + # if searched_node not in [node[0] for node in task_trace_node_list]: + # task_trace_node_list.append((searched_node, error_node.content)) + + same_error_success_knowledge_pair_list = [] + same_error_success_node_set = set() + for error_node_list, trace_node in task_trace_node_list: + for searched_trace_success_node in self.knowledgebase.graph_query_by_node( + node=trace_node, + step=50, + constraint_labels=[ + "task_trace", + "task_success_implement", + "task_description", + ], + block=True, + ): + if ( + searched_trace_success_node not in same_error_success_node_set + and searched_trace_success_node.label == "task_success_implement" + ): + same_error_success_node_set.add(searched_trace_success_node) + + trace_knowledge = self.knowledgebase.node_to_implementation_knowledge_dict[trace_node.id] + success_knowledge = self.knowledgebase.node_to_implementation_knowledge_dict[ + searched_trace_success_node.id + ] + error_content = "" + for index, error_node in enumerate(error_node_list): + error_content += f"{index+1}. 
{error_node.content}; " + same_error_success_knowledge_pair_list.append( + ( + error_content, + (trace_knowledge, success_knowledge), + ), + ) + + if knowledge_sampler > 0: + same_error_success_knowledge_pair_list = [ + knowledge + for knowledge in same_error_success_knowledge_pair_list + if random.uniform(0, 1) <= knowledge_sampler + ] + + same_error_success_knowledge_pair_list = same_error_success_knowledge_pair_list[:v2_query_error_limit] + factor_implementation_queried_graph_knowledge.error_with_success_task[ + target_factor_task_information + ] = same_error_success_knowledge_pair_list + + return factor_implementation_queried_graph_knowledge + + +class FactorImplementationGraphKnowledgeBase(KnowledgeBase): + def __init__(self, init_component_list=None) -> None: + """ + Load knowledge, offer brief information of knowledge and common handle interfaces + """ + self.graph: UndirectedGraph = UndirectedGraph.load(Path.cwd() / "graph.pkl") + FinCoLog().info(f"Knowledge Graph loaded, size={self.graph.size()}") + + if init_component_list: + for component in init_component_list: + exist_node = self.graph.get_node_by_content(content=component) + node = exist_node if exist_node else UndirectedNode(content=component, label="component") + self.graph.add_nodes(node=node, neighbors=[]) + + # A dict containing all working trace until they fail or succeed + self.working_trace_knowledge = {} + + # A dict containing error analysis each step aligned with working trace + self.working_trace_error_analysis = {} + + # Add already success task + self.success_task_to_knowledge_dict = {} + + # key:node_id(for task trace and success implement), value:knowledge instance(aka 'FactorImplementationKnowledge') + self.node_to_implementation_knowledge_dict = {} + + # store the task description to component nodes + self.task_to_component_nodes = {} + + def get_all_nodes_by_label(self, label: str) -> list[UndirectedNode]: + return self.graph.get_all_nodes_by_label(label) + + def update_success_task( + self, + success_task_info: str, + ): # Transfer the success tasks' working trace to knowledge storage & graph + success_task_trace = self.working_trace_knowledge[success_task_info] + success_task_error_analysis_record = ( + self.working_trace_error_analysis[success_task_info] + if success_task_info in self.working_trace_error_analysis + else [] + ) + task_des_node = UndirectedNode(content=success_task_info, label="task_description") + self.graph.add_nodes( + node=task_des_node, + neighbors=self.task_to_component_nodes[success_task_info], + ) # 1st version, we assume that all component nodes are given + for index, trace_unit in enumerate(success_task_trace): # every unit: single_knowledge + neighbor_nodes = [task_des_node] + if index != len(success_task_trace) - 1: + trace_node = UndirectedNode( + content=trace_unit.get_implementation_and_feedback_str(), + label="task_trace", + ) + self.node_to_implementation_knowledge_dict[trace_node.id] = trace_unit + for node_index, error_node in enumerate(success_task_error_analysis_record[index]): + if type(error_node).__name__ == "str": + queried_node = self.graph.get_node_by_content(content=error_node) + if queried_node is None: + new_error_node = UndirectedNode(content=error_node, label="error") + self.graph.add_node(node=new_error_node) + success_task_error_analysis_record[index][node_index] = new_error_node + else: + success_task_error_analysis_record[index][node_index] = queried_node + neighbor_nodes.extend(success_task_error_analysis_record[index]) + 
self.graph.add_nodes(node=trace_node, neighbors=neighbor_nodes) + else: + success_node = UndirectedNode( + content=trace_unit.get_implementation_and_feedback_str(), + label="task_success_implement", + ) + self.graph.add_nodes(node=success_node, neighbors=neighbor_nodes) + self.node_to_implementation_knowledge_dict[success_node.id] = trace_unit + + def query(self): + pass + + def graph_get_node_by_content(self, content: str) -> UndirectedNode: + return self.graph.get_node_by_content(content=content) + + def graph_query_by_content( + self, + content: Union[str, list[str]], + topk_k: int = 5, + step: int = 1, + constraint_labels: list[str] = None, + constraint_node: UndirectedNode = None, + similarity_threshold: float = 0.0, + constraint_distance: float = 0, + block: bool = False, + ) -> list[UndirectedNode]: + """ + search graph by content similarity and connection relationship, return empty list if nodes' chain without node + near to constraint_node + + Parameters + ---------- + constraint_distance + content + topk_k: the upper number of output for each query, if the number of fit nodes is less than topk_k, return all fit nodes's content + step + constraint_labels + constraint_node + similarity_threshold + block: despite the start node, the search can only flow through the constraint_label type nodes + + Returns + ------- + + """ + + return self.graph.query_by_content( + content=content, + topk_k=topk_k, + step=step, + constraint_labels=constraint_labels, + constraint_node=constraint_node, + similarity_threshold=similarity_threshold, + constraint_distance=constraint_distance, + block=block, + ) + + def graph_query_by_node( + self, + node: UndirectedNode, + step: int = 1, + constraint_labels: list[str] = None, + constraint_node: UndirectedNode = None, + constraint_distance: float = 0, + block: bool = False, + ) -> list[UndirectedNode]: + """ + search graph by connection, return empty list if nodes' chain without node near to constraint_node + Parameters + ---------- + node : start node + step : the max steps will be searched + constraint_labels : the labels of output nodes + constraint_node : the node that the output nodes must connect to + constraint_distance : the max distance between output nodes and constraint_node + block: despite the start node, the search can only flow through the constraint_label type nodes + + Returns + ------- + A list of nodes + + """ + nodes = self.graph.query_by_node( + node=node, + step=step, + constraint_labels=constraint_labels, + constraint_node=constraint_node, + constraint_distance=constraint_distance, + block=block, + ) + return nodes + + def graph_query_by_intersection( + self, + nodes: list[UndirectedNode], + steps: int = 1, + constraint_labels: list[str] = None, + output_intersection_origin: bool = False, + ) -> list[UndirectedNode] | list[list[list[UndirectedNode], UndirectedNode]]: + """ + search graph by node intersection, node intersected by a higher frequency has a prior order in the list + Parameters + ---------- + nodes : node list + step : the max steps will be searched + constraint_labels : the labels of output nodes + output_intersection_origin: output the list that contains the node which form this intersection node + + Returns + ------- + A list of nodes + + """ + node_count = len(nodes) + assert node_count >= 2, "nodes length must >=2" + intersection_node_list = [] + if output_intersection_origin: + origin_list = [] + for k in range(node_count, 1, -1): + possible_combinations = combinations(nodes, k) + for possible_combination in 
possible_combinations: + node_list = list(possible_combination) + intersection_node_list.extend( + self.graph.get_nodes_intersection(node_list, steps=steps, constraint_labels=constraint_labels) + ) + if output_intersection_origin: + for _ in range(len(intersection_node_list)): + origin_list.append(node_list) + intersection_node_list_sort_by_freq = [] + for index, node in enumerate(intersection_node_list): + if node not in intersection_node_list_sort_by_freq: + if output_intersection_origin: + intersection_node_list_sort_by_freq.append([origin_list[index], node]) + else: + intersection_node_list_sort_by_freq.append(node) + + return intersection_node_list_sort_by_freq diff --git a/rdagent/factor_implementation/share_modules/conf.py b/rdagent/factor_implementation/share_modules/conf.py new file mode 100644 index 00000000..574c5dfc --- /dev/null +++ b/rdagent/factor_implementation/share_modules/conf.py @@ -0,0 +1,42 @@ +from pathlib import Path + +from finco.conf import FincoSettings + + +class FactorImplementSettings(FincoSettings): + file_based_execution_data_folder: str = str( + (Path().cwd() / "git_ignore_folder" / "factor_implementation_source_data").absolute(), + ) + file_based_execution_workspace: str = str( + (Path().cwd() / "git_ignore_folder" / "factor_implementation_workspace").absolute(), + ) + implementation_execution_cache_location: str = str( + (Path().cwd() / "git_ignore_folder" / "factor_implementation_execution_cache.pkl").absolute(), + ) + enable_execution_cache: bool = True # whether to enable the execution cache + + # TODO: the factor implement specific settings should not appear in this settings + # Evolving should have a method specific settings + # evolving related config + fail_task_trial_limit: int = 20 + + v1_query_former_trace_limit: int = 5 + v1_query_similar_success_limit: int = 5 + + v2_query_component_limit: int = 1 + v2_query_error_limit: int = 1 + v2_query_former_trace_limit: int = 1 + v2_error_summary: bool = False + v2_knowledge_sampler: float = 1.0 + + chat_token_limit: int = ( + 100000 # 100000 is the maximum limit of gpt4, which might increase in the future version of gpt + ) + + implementation_factors_per_round: int = 100 # how many factors to choose for each round of evolving + evo_multi_proc_n: int = 16 # how many processes to use for evolving (including eval & generation) + + file_based_execution_timeout: int = 120 # seconds for each factor implementation execution + + +FIS = FactorImplementSettings() diff --git a/rdagent/factor_implementation/share_modules/evaluator.py b/rdagent/factor_implementation/share_modules/evaluator.py new file mode 100644 index 00000000..0596525a --- /dev/null +++ b/rdagent/factor_implementation/share_modules/evaluator.py @@ -0,0 +1,535 @@ +import json +from abc import ABC, abstractmethod +from typing import Tuple + +import pandas as pd +from jinja2 import Template + +from oai.llm_utils import APIBackend +from finco.log import FinCoLog +from factor_implementation.share_modules.conf import FactorImplementSettings +from factor_implementation.share_modules.factor import ( + FactorImplementation, + FactorImplementationTask, +) +from factor_implementation.share_modules.prompt import ( + FactorImplementationPrompts, +) + + +class Evaluator(ABC): + @abstractmethod + def evaluate( + self, + target_task: FactorImplementationTask, + implementation: FactorImplementation, + gt_implementation: FactorImplementation, + **kwargs, + ): + raise NotImplementedError + + +class FactorImplementationCodeEvaluator(Evaluator): + def evaluate( + 
        self,
+        target_task: FactorImplementationTask,
+        implementation: FactorImplementation,
+        execution_feedback: str,
+        factor_value_feedback: str = "",
+        gt_implementation: FactorImplementation = None,
+        **kwargs,
+    ):
+        factor_information = target_task.get_factor_information()
+        code = implementation.code
+
+        system_prompt = FactorImplementationPrompts()["evaluator_code_feedback_v1_system"]
+
+        execution_feedback_to_render = execution_feedback
+        user_prompt = Template(
+            FactorImplementationPrompts()["evaluator_code_feedback_v1_user"],
+        ).render(
+            factor_information=factor_information,
+            code=code,
+            execution_feedback=execution_feedback_to_render,
+            factor_value_feedback=factor_value_feedback,
+            gt_code=gt_implementation.code if gt_implementation else None,
+        )
+        while (
+            APIBackend().build_messages_and_calculate_token(
+                user_prompt=user_prompt,
+                system_prompt=system_prompt,
+                former_messages=[],
+            )
+            > FactorImplementSettings().chat_token_limit
+        ):
+            # The prompt exceeds the token limit: drop the older half of the
+            # execution feedback and re-render until the prompt fits.
+            execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :]
+            user_prompt = Template(
+                FactorImplementationPrompts()["evaluator_code_feedback_v1_user"],
+            ).render(
+                factor_information=factor_information,
+                code=code,
+                execution_feedback=execution_feedback_to_render,
+                factor_value_feedback=factor_value_feedback,
+                gt_code=gt_implementation.code if gt_implementation else None,
+            )
+        critic_response = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=system_prompt,
+            json_mode=False,
+        )
+
+        # critic_response = json.loads(critic_response)
+        return critic_response
+
+
+class FactorImplementationEvaluator(Evaluator):
+    # TODO:
+    # We should have a unified interface for all evaluators, so the interfaces
+    # of the other evaluators should be adjusted accordingly.
+    @abstractmethod
+    def evaluate(
+        self,
+        gt: FactorImplementation,
+        gen: FactorImplementation,
+    ) -> Tuple[str, object]:
+        """You can get the dataframe by
+
+        .. code-block:: python
+
+            _, gt_df = gt.execute()
+            _, gen_df = gen.execute()
+
+        Returns
+        -------
+        Tuple[str, object]
+            - str: the text-based description of the evaluation result
+            - object: a comparable metric (bool, integer, float ...)
+
+        """
+        raise NotImplementedError("Please implement the `evaluate` method")
+
+    def _get_df(self, gt: FactorImplementation, gen: FactorImplementation):
+        _, gt_df = gt.execute()
+        _, gen_df = gen.execute()
+        if isinstance(gen_df, pd.Series):
+            gen_df = gen_df.to_frame("source_factor")
+        if isinstance(gt_df, pd.Series):
+            gt_df = gt_df.to_frame("gt_factor")
+        return gt_df, gen_df
+
+
+# NOTE: the following evaluators are split out of FactorImplementationValueEvaluator
+
+
+class FactorImplementationSingleColumnEvaluator(FactorImplementationEvaluator):
+    def evaluate(
+        self,
+        gt: FactorImplementation,
+        gen: FactorImplementation,
+    ) -> Tuple[str, object]:
+        gt_df, gen_df = self._get_df(gt, gen)
+
+        if len(gen_df.columns) == 1 and len(gt_df.columns) == 1:
+            return "Both dataframes have only one column.", True
+        elif len(gen_df.columns) != 1:
+            gen_df = gen_df.iloc(axis=1)[
+                [
+                    0,
+                ]
+            ]
+            return (
+                "The source dataframe has more than one column. Please check the implementation. We only evaluate the first column.",
+                False,
+            )
+        return "", False
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+
+class FactorImplementationIndexFormatEvaluator(FactorImplementationEvaluator):
+    def evaluate(
+        self,
+        gt: FactorImplementation,
+        gen: FactorImplementation,
+    ) -> Tuple[str, object]:
+        gt_df, gen_df = self._get_df(gt, gen)
+        # NOTE: compare as a list; `Index.names` is a pandas FrozenList, which
+        # never equals a tuple, so comparing against ("datetime", "instrument")
+        # was always False.
+        idx_name_right = list(gen_df.index.names) == ["datetime", "instrument"]
+        if idx_name_right:
+            return (
+                'The index of the dataframe is ("datetime", "instrument") and aligns with the predefined format.',
+                True,
+            )
+        else:
+            return (
+                'The index of the dataframe is not ("datetime", "instrument"). Please check the implementation.',
+                False,
+            )
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+
+class FactorImplementationRowCountEvaluator(FactorImplementationEvaluator):
+    def evaluate(
+        self,
+        gt: FactorImplementation,
+        gen: FactorImplementation,
+    ) -> Tuple[str, object]:
+        gt_df, gen_df = self._get_df(gt, gen)
+
+        if gen_df.shape[0] == gt_df.shape[0]:
+            return "Both dataframes have the same row count.", True
+        else:
+            return (
+                f"The source dataframe and the ground truth dataframe have different row counts. The source dataframe has {gen_df.shape[0]} rows, while the ground truth dataframe has {gt_df.shape[0]} rows. Please check the implementation.",
+                False,
+            )
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+
+class FactorImplementationIndexEvaluator(FactorImplementationEvaluator):
+    def evaluate(
+        self,
+        gt: FactorImplementation,
+        gen: FactorImplementation,
+    ) -> Tuple[str, object]:
+        gt_df, gen_df = self._get_df(gt, gen)
+
+        if gen_df.index.equals(gt_df.index):
+            return "Both dataframes have the same index.", True
+        else:
+            return (
+                "The source dataframe and the ground truth dataframe have different indices. Please check the implementation.",
+                False,
+            )
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+
+class FactorImplementationMissingValuesEvaluator(FactorImplementationEvaluator):
+    def evaluate(
+        self,
+        gt: FactorImplementation,
+        gen: FactorImplementation,
+    ) -> Tuple[str, object]:
+        gt_df, gen_df = self._get_df(gt, gen)
+
+        if gen_df.isna().sum().sum() == gt_df.isna().sum().sum():
+            return "Both dataframes have the same number of missing values.", True
+        else:
+            return (
+                f"The dataframes do not have the same number of missing values. The source dataframe has {gen_df.isna().sum().sum()} missing values, while the ground truth dataframe has {gt_df.isna().sum().sum()} missing values. Please check the implementation.",
+                False,
+            )
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+
+class FactorImplementationValuesEvaluator(FactorImplementationEvaluator):
+    def evaluate(
+        self,
+        gt: FactorImplementation,
+        gen: FactorImplementation,
+    ) -> Tuple[str, object]:
+        gt_df, gen_df = self._get_df(gt, gen)
+
+        try:
+            close_values = gen_df.sub(gt_df).abs().lt(1e-6)
+            result_int = close_values.astype(int)
+            pos_num = result_int.sum().sum()
+            acc_rate = pos_num / close_values.size
+        except Exception:
+            # The elementwise comparison can fail (e.g. on a shape or dtype
+            # mismatch); fall back to the raw dataframe and a zero accuracy so
+            # the returns below never reference an undefined `acc_rate`.
+            close_values = gen_df
+            acc_rate = 0.0
+        if close_values.all().iloc[0]:
+            return (
+                "All values in the dataframes are equal within the tolerance of 1e-6.",
+                acc_rate,
+            )
+        else:
+            return (
+                "Some values differ by more than the tolerance of 1e-6. Check for rounding errors or differences in the calculation methods.",
+                acc_rate,
+            )
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+
+class FactorImplementationCorrelationEvaluator(FactorImplementationEvaluator):
+    def __init__(self, hard_check: bool) -> None:
+        self.hard_check = hard_check
+
+    def evaluate(
+        self,
+        gt: FactorImplementation,
+        gen: FactorImplementation,
+    ) -> Tuple[str, object]:
+        gt_df, gen_df = self._get_df(gt, gen)
+
+        concat_df = pd.concat([gen_df, gt_df], axis=1)
+        concat_df.columns = ["source", "gt"]
+        ic = concat_df.groupby("datetime").apply(lambda df: df["source"].corr(df["gt"])).dropna().mean()
+        ric = (
+            concat_df.groupby("datetime")
+            .apply(lambda df: df["source"].corr(df["gt"], method="spearman"))
+            .dropna()
+            .mean()
+        )
+
+        if self.hard_check:
+            if ic > 0.99 and ric > 0.99:
+                return (
+                    f"The dataframes are highly correlated. The ic is {ic:.6f} and the rankic is {ric:.6f}.",
+                    True,
+                )
+            else:
+                return (
+                    f"The dataframes are not sufficiently correlated. The ic is {ic:.6f} and the rankic is {ric:.6f}. Investigate the factors that might be causing the discrepancies and ensure that the logic of the factor calculation is consistent.",
+                    False,
+                )
+        else:
+            return f"The ic is ({ic:.6f}) and the rankic is ({ric:.6f}).", ic
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+
+class FactorImplementationValEvaluator(FactorImplementationEvaluator):
+    def evaluate(self, gt: FactorImplementation, gen: FactorImplementation):
+        _, gt_df = gt.execute()
+        _, gen_df = gen.execute()
+        # FIXME: refactor the two classes
+        fiv = FactorImplementationValueEvaluator()
+        return fiv.evaluate(source_df=gen_df, gt_df=gt_df)
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+
+class FactorImplementationValueEvaluator(Evaluator):
+    # TODO: let's discuss the interface of the evaluator
+    def evaluate(
+        self,
+        source_df: pd.DataFrame,
+        gt_df: pd.DataFrame,
+        **kwargs,
+    ) -> Tuple:
+        conclusions = []
+
+        if isinstance(source_df, pd.Series):
+            source_df = source_df.to_frame("source_factor")
+            conclusions.append(
+                "The source dataframe is a Series; it is better to convert it to a DataFrame.",
+            )
+        if gt_df is not None and isinstance(gt_df, pd.Series):
+            gt_df = gt_df.to_frame("gt_factor")
+            conclusions.append(
+                "The ground truth dataframe is a Series; it has been converted to a DataFrame.",
+            )
+
+        # Check whether each dataframe has only one column
+        if len(source_df.columns) == 1:
+            conclusions.append("The source dataframe has only one column, which is correct.")
+        else:
+            conclusions.append(
+                "The source dataframe has more than one column. Please check the implementation. We only evaluate the first column.",
+            )
+            source_df = source_df.iloc(axis=1)[
+                [
+                    0,
+                ]
+            ]
+
+        if list(source_df.index.names) != ["datetime", "instrument"]:
+            conclusions.append(
+                rf"The index of the dataframe is not (\"datetime\", \"instrument\"), instead is {source_df.index.names}. Please check the implementation.",
+            )
+        else:
+            conclusions.append(
+                'The index of the dataframe is ("datetime", "instrument") and aligns with the predefined format.',
+            )
+
+        # Check whether both dataframes have the same row count
+        if gt_df is not None:
+            if source_df.shape[0] == gt_df.shape[0]:
+                conclusions.append("Both dataframes have the same row count.")
+                same_row_count_result = True
+            else:
+                conclusions.append(
+                    f"The source dataframe and the ground truth dataframe have different row counts. The source dataframe has {source_df.shape[0]} rows, while the ground truth dataframe has {gt_df.shape[0]} rows. Please check the implementation.",
+                )
+                same_row_count_result = False
+
+            # Check whether both dataframes have the same index
+            if source_df.index.equals(gt_df.index):
+                conclusions.append("Both dataframes have the same index.")
+                same_index_result = True
+            else:
+                conclusions.append(
+                    "The source dataframe and the ground truth dataframe have different indices. Please check the implementation.",
+                )
+                same_index_result = False
+
+            # Check for the same missing values (NaN)
+            if source_df.isna().sum().sum() == gt_df.isna().sum().sum():
+                conclusions.append("Both dataframes have the same number of missing values.")
+                same_missing_values_result = True
+            else:
+                conclusions.append(
+                    f"The dataframes do not have the same number of missing values. The source dataframe has {source_df.isna().sum().sum()} missing values, while the ground truth dataframe has {gt_df.isna().sum().sum()} missing values. Please check the implementation.",
+                )
+                same_missing_values_result = False
+
+            # Check whether the values are the same within a small tolerance
+            if not same_index_result:
+                conclusions.append(
+                    "The source dataframe and the ground truth dataframe have different indices. Skip comparing the values and the correlation, since such a comparison would be meaningless.",
+                )
+                same_values_result = False
+                high_correlation_result = False
+            else:
+                close_values = source_df.sub(gt_df).abs().lt(1e-6)
+                if close_values.all().iloc[0]:
+                    conclusions.append(
+                        "All values in the dataframes are equal within the tolerance of 1e-6.",
+                    )
+                    same_values_result = True
+                else:
+                    conclusions.append(
+                        "Some values differ by more than the tolerance of 1e-6. Check for rounding errors or differences in the calculation methods.",
+                    )
+                    same_values_result = False
+
+                # Check the ic and rankic between the two dataframes
+                concat_df = pd.concat([source_df, gt_df], axis=1)
+                concat_df.columns = ["source", "gt"]
+                try:
+                    ic = concat_df.groupby("datetime").apply(lambda df: df["source"].corr(df["gt"])).dropna().mean()
+                    ric = (
+                        concat_df.groupby("datetime")
+                        .apply(lambda df: df["source"].corr(df["gt"], method="spearman"))
+                        .dropna()
+                        .mean()
+                    )
+
+                    if ic > 0.99 and ric > 0.99:
+                        conclusions.append(
+                            f"The dataframes are highly correlated. The ic is {ic:.6f} and the rankic is {ric:.6f}.",
+                        )
+                        high_correlation_result = True
+                    else:
+                        conclusions.append(
+                            f"The dataframes are not sufficiently correlated. The ic is {ic:.6f} and the rankic is {ric:.6f}. Investigate the factors that might be causing the discrepancies and ensure that the logic of the factor calculation is consistent.",
+                        )
+                        high_correlation_result = False
+
+                    # Check for shifted alignments only in the "datetime" index
+                    max_shift_days = 2
+                    for shift in range(-max_shift_days, max_shift_days + 1):
+                        if shift == 0:
+                            continue  # Skip the case where there is no shift
+
+                        shifted_source_df = source_df.groupby(level="instrument").shift(shift)
+                        concat_df = pd.concat([shifted_source_df, gt_df], axis=1)
+                        concat_df.columns = ["source", "gt"]
+                        shifted_ric = (
+                            concat_df.groupby("datetime")
+                            .apply(lambda df: df["source"].corr(df["gt"], method="spearman"))
+                            .dropna()
+                            .mean()
+                        )
+                        if shifted_ric > 0.99:
+                            conclusions.append(
+                                # Report the actual shift that matched, not the search bound
+                                f"The dataframes are highly correlated with a shift of {shift} day(s) in the 'datetime' index. Shifted rankic: {shifted_ric:.6f}.",
+                            )
+                            break
+                    else:
+                        conclusions.append(
+                            f"No sufficient correlation found when shifting up to {max_shift_days} days in the 'datetime' index. Investigate the factors that might be causing discrepancies.",
+                        )
+
+                except Exception as e:
+                    FinCoLog().warning(f"Error occurred when calculating the correlation: {str(e)}")
+                    conclusions.append(
+                        f"Some error occurred when calculating the correlation. Investigate the factors that might be causing the discrepancies and ensure that the logic of the factor calculation is consistent. Error: {e}",
+                    )
+                    high_correlation_result = False
+
+        # Combine all conclusions into a single string
+        conclusion_str = "\n".join(conclusions)
+
+        final_result = (same_values_result or high_correlation_result) if gt_df is not None else False
+        return conclusion_str, final_result
+
+
+# TODO:
+def shorten_prompt(tpl: str, render_kwargs: dict, shorten_key: str, max_trail: int = 10) -> str:
+    """Shorten a prompt when it is too long.
+    Instead of truncating the rendered prompt directly, find the key whose value should be shortened and shorten only that value.
+    """
+    # TODO: this should replace most of the code in
+    # - FactorImplementationFinalDecisionEvaluator.evaluate
+    # - FactorImplementationCodeEvaluator.evaluate
+
+
+class FactorImplementationFinalDecisionEvaluator(Evaluator):
+    def evaluate(
+        self,
+        target_task: FactorImplementationTask,
+        execution_feedback: str,
+        value_feedback: str,
+        code_feedback: str,
+        **kwargs,
+    ) -> Tuple:
+        system_prompt = FactorImplementationPrompts()["evaluator_final_decision_v1_system"]
+        execution_feedback_to_render = execution_feedback
+        user_prompt = Template(
+            FactorImplementationPrompts()["evaluator_final_decision_v1_user"],
+        ).render(
+            factor_information=target_task.get_factor_information(),
+            execution_feedback=execution_feedback_to_render,
+            code_feedback=code_feedback,
+            factor_value_feedback=(
+                value_feedback
+                if value_feedback is not None
+                else "No Ground Truth Value provided, so no evaluation on value is performed."
+            ),
+        )
+        while (
+            APIBackend().build_messages_and_calculate_token(
+                user_prompt=user_prompt,
+                system_prompt=system_prompt,
+                former_messages=[],
+            )
+            > FactorImplementSettings().chat_token_limit
+        ):
+            execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :]
+            user_prompt = Template(
+                FactorImplementationPrompts()["evaluator_final_decision_v1_user"],
+            ).render(
+                factor_information=target_task.get_factor_information(),
+                execution_feedback=execution_feedback_to_render,
+                code_feedback=code_feedback,
+                factor_value_feedback=(
+                    value_feedback
+                    if value_feedback is not None
+                    else "No Ground Truth Value provided, so no evaluation on value is performed."
+                ),
+            )
+
+        final_evaluation_dict = json.loads(
+            APIBackend().build_messages_and_create_chat_completion(
+                user_prompt=user_prompt,
+                system_prompt=system_prompt,
+                json_mode=True,
+            ),
+        )
+        return (
+            final_evaluation_dict["final_decision"],
+            final_evaluation_dict["final_feedback"],
+        )
diff --git a/rdagent/factor_implementation/share_modules/exception.py b/rdagent/factor_implementation/share_modules/exception.py
new file mode 100644
index 00000000..5260bc50
--- /dev/null
+++ b/rdagent/factor_implementation/share_modules/exception.py
@@ -0,0 +1,26 @@
+class ImplementRunException(Exception):
+    """
+    Exceptions raised while implementing and running code.
+    - start: FactorImplementationTask => FactorGenerator
+    - end: get the dataframe after execution
+
+    More detailed evaluation of the dataframe values is handled by the evaluator.
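+
+    A minimal sketch of the intended handling when an implementation is executed
+    with ``raise_exception=True`` (the `implementation` object is illustrative):
+
+    .. code-block:: python
+
+        try:
+            feedback, factor_df = implementation.execute()
+        except ImplementRunException as e:
+            # covers CodeFormatException, RuntimeErrorException and NoOutputException
+            feedback, factor_df = str(e), None
+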
+ """ + + +class CodeFormatException(ImplementRunException): + """ + The generated code is not found due format error. + """ + + +class RuntimeErrorException(ImplementRunException): + """ + The generated code fail to execute the script. + """ + + +class NoOutputException(ImplementRunException): + """ + The code fail to generate output file. + """ diff --git a/rdagent/factor_implementation/share_modules/factor.py b/rdagent/factor_implementation/share_modules/factor.py new file mode 100644 index 00000000..1a37ca38 --- /dev/null +++ b/rdagent/factor_implementation/share_modules/factor.py @@ -0,0 +1,221 @@ +import pickle +import subprocess +import uuid +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Tuple, Union + +import pandas as pd +from filelock import FileLock + +from oai.llm_utils import md5_hash +from finco.log import FinCoLog +from factor_implementation.share_modules.conf import FactorImplementSettings +from factor_implementation.share_modules.exception import ( + CodeFormatException, + NoOutputException, + RuntimeErrorException, +) + + +class FactorImplementationTask: + # TODO: remove the factor_ prefix may be better + def __init__( + self, + factor_name, + factor_description, + factor_formulation, + factor_formulation_description, + variables: dict = {}, + ) -> None: + self.factor_name = factor_name + self.factor_description = factor_description + self.factor_formulation = factor_formulation + self.factor_formulation_description = factor_formulation_description + # TODO: check variables a good candidate + self.variables = variables + + def get_factor_information(self): + return f"""factor_name: {self.factor_name} +factor_description: {self.factor_description} +factor_formulation: {self.factor_formulation} +factor_formulation_description: {self.factor_formulation_description}""" + + @staticmethod + def from_dict(dict): + return FactorImplementationTask(**dict) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}[{self.factor_name}]>" + + +class FactorImplementation(ABC): + def __init__(self, target_task: FactorImplementationTask) -> None: + self.target_task = target_task + + @abstractmethod + def execute(self, *args, **kwargs) -> Tuple[str, pd.DataFrame]: + raise NotImplementedError("__call__ method is not implemented.") + + +class FileBasedFactorImplementation(FactorImplementation): + """ + This class is used to implement a factor by writing the code to a file. + Input data and output factor value are also written to files. + """ + + # TODO: (Xiao) think raising errors may get better information for processing + FB_FROM_CACHE = "The factor value has been executed and stored in the instance variable." + FB_EXEC_SUCCESS = "Execution succeeded without error." + FB_CODE_NOT_SET = "code is not set." + FB_EXECUTION_SUCCEEDED = "Execution succeeded without error." + FB_OUTPUT_FILE_NOT_FOUND = "\nExpected output file not found." + FB_OUTPUT_FILE_FOUND = "\nExpected output file found." 
+ + def __init__( + self, + target_task: FactorImplementationTask, + code, + executed_factor_value_dataframe=None, + raise_exception=False, + ) -> None: + super().__init__(target_task) + self.code = code + self.executed_factor_value_dataframe = executed_factor_value_dataframe + self.logger = FinCoLog() + self.raise_exception = raise_exception + self.workspace_path = Path( + FactorImplementSettings().file_based_execution_workspace, + ) / str(uuid.uuid4()) + + @staticmethod + def link_data_to_workspace(data_path: Path, workspace_path: Path): + data_path = Path(data_path) + workspace_path = Path(workspace_path) + for data_file_path in data_path.iterdir(): + workspace_data_file_path = workspace_path / data_file_path.name + if workspace_data_file_path.exists(): + workspace_data_file_path.unlink() + subprocess.run( + ["ln", "-s", data_file_path, workspace_data_file_path], + check=False, + ) + + def execute(self, store_result: bool = False) -> Tuple[str, pd.DataFrame]: + """ + execute the implementation and get the factor value by the following steps: + 1. make the directory in workspace path + 2. write the code to the file in the workspace path + 3. link all the source data to the workspace path folder + 4. execute the code + 5. read the factor value from the output file in the workspace path folder + returns the execution feedback as a string and the factor value as a pandas dataframe + + parameters: + store_result: if True, store the factor value in the instance variable, this feature is to be used in the gt implementation to avoid multiple execution on the same gt implementation + """ + if self.code is None: + if self.raise_exception: + raise CodeFormatException(self.FB_CODE_NOT_SET) + else: + # TODO: to make the interface compatible with previous code. I kept the original behavior. 
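+                # NOTE: even with `raise_exception=False` we still raise here:
+                # without any code there is nothing to execute, so no feedback
+                # string could be returned instead.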
+ raise ValueError(self.FB_CODE_NOT_SET) + with FileLock(self.workspace_path / "execution.lock"): + (Path.cwd() / "git_ignore_folder" / "factor_implementation_execution_cache").mkdir( + exist_ok=True, parents=True + ) + if FactorImplementSettings().enable_execution_cache: + # NOTE: cache the result for the same code + target_file_name = md5_hash(self.code) + cache_file_path = ( + Path.cwd() + / "git_ignore_folder" + / "factor_implementation_execution_cache" + / f"{target_file_name}.pkl" + ) + if cache_file_path.exists() and not self.raise_exception: + cached_res = pickle.load(open(cache_file_path, "rb")) + if store_result and cached_res[1] is not None: + self.executed_factor_value_dataframe = cached_res[1] + return cached_res + + if self.executed_factor_value_dataframe is not None: + return self.FB_FROM_CACHE, self.executed_factor_value_dataframe + + source_data_path = Path( + FactorImplementSettings().file_based_execution_data_folder, + ) + self.workspace_path.mkdir(exist_ok=True, parents=True) + + code_path = self.workspace_path / f"{self.target_task.factor_name}.py" + code_path.write_text(self.code) + + self.link_data_to_workspace(source_data_path, self.workspace_path) + + execution_feedback = self.FB_EXECUTION_SUCCEEDED + try: + subprocess.check_output( + f"python {code_path}", + shell=True, + cwd=self.workspace_path, + stderr=subprocess.STDOUT, + timeout=FactorImplementSettings().file_based_execution_timeout, + ) + except subprocess.CalledProcessError as e: + import site + + execution_feedback = ( + e.output.decode() + .replace(str(code_path.parent.absolute()), r"/path/to") + .replace(str(site.getsitepackages()[0]), r"/path/to/site-packages") + ) + if len(execution_feedback) > 2000: + execution_feedback = ( + execution_feedback[:1000] + "....hidden long error message...." + execution_feedback[-1000:] + ) + if self.raise_exception: + raise RuntimeErrorException(execution_feedback) + except subprocess.TimeoutExpired: + execution_feedback += f"Execution timeout error and the timeout is set to {FactorImplementSettings().file_based_execution_timeout} seconds." + if self.raise_exception: + raise RuntimeErrorException(execution_feedback) + + workspace_output_file_path = self.workspace_path / "result.h5" + if not workspace_output_file_path.exists(): + execution_feedback += self.FB_OUTPUT_FILE_NOT_FOUND + executed_factor_value_dataframe = None + if self.raise_exception: + raise NoOutputException(execution_feedback) + else: + try: + executed_factor_value_dataframe = pd.read_hdf(workspace_output_file_path) + execution_feedback += self.FB_OUTPUT_FILE_FOUND + except Exception as e: + execution_feedback += f"Error found when reading hdf file: {e}"[:1000] + executed_factor_value_dataframe = None + + if store_result and executed_factor_value_dataframe is not None: + self.executed_factor_value_dataframe = executed_factor_value_dataframe + + if FactorImplementSettings().enable_execution_cache: + pickle.dump( + (execution_feedback, executed_factor_value_dataframe), + open(cache_file_path, "wb"), + ) + return execution_feedback, executed_factor_value_dataframe + + def __str__(self) -> str: + # NOTE: + # If the code cache works, the workspace will be None. 
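+        # (In practice `workspace_path` is always assigned in `__init__`; on a
+        # cache hit the directory is simply never created on disk.)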
+ return f"File Factor[{self.target_task.factor_name}]: {self.workspace_path}" + + def __repr__(self) -> str: + return self.__str__() + + @staticmethod + def from_folder(task: FactorImplementationTask, path: Union[str, Path], **kwargs): + path = Path(path) + factor_path = (path / task.factor_name).with_suffix(".py") + with factor_path.open("r") as f: + code = f.read() + return FileBasedFactorImplementation(task, code=code, **kwargs) diff --git a/rdagent/factor_implementation/share_modules/factor_gen.py b/rdagent/factor_implementation/share_modules/factor_gen.py new file mode 100644 index 00000000..ff4bfc24 --- /dev/null +++ b/rdagent/factor_implementation/share_modules/factor_gen.py @@ -0,0 +1,31 @@ +from abc import ABC, abstractmethod +from typing import List + +from factor_implementation.share_modules.factor import ( + FactorImplementation, + FactorImplementationTask, +) + + +class FactorGenerator(ABC): + """ + Because implementing factors will help each other in the process of implementation, we use the interface `List[FactorImplementationTask] -> List[FactorImplementation]` instead of single factor . + """ + + def __init__(self, target_task_l: List[FactorImplementationTask]) -> None: + self.target_task_l = target_task_l + + @abstractmethod + def generate(self, *args, **kwargs) -> List[FactorImplementation]: + raise NotImplementedError("generate method is not implemented.") + + def collect_feedback(self, feedback_obj_l: List[object]): + """ + When online evaluation. + The preivous feedbacks will be collected to support advanced factor generator + + Parameters + ---------- + feedback_obj_l : List[object] + + """ diff --git a/rdagent/factor_implementation/share_modules/prompt.py b/rdagent/factor_implementation/share_modules/prompt.py new file mode 100644 index 00000000..765c5d94 --- /dev/null +++ b/rdagent/factor_implementation/share_modules/prompt.py @@ -0,0 +1,23 @@ +from pathlib import Path +from typing import Dict + +import yaml + +from finco.utils import SingletonBaseClass + + +class FactorImplementationPrompts(Dict, SingletonBaseClass): + def __init__(self): + super().__init__() + prompt_yaml_path = Path(__file__).parent / "prompts.yaml" + + prompt_yaml_dict = yaml.load( + open( + prompt_yaml_path, + encoding="utf8", + ), + Loader=yaml.FullLoader, + ) + + for key, value in prompt_yaml_dict.items(): + self[key] = value diff --git a/rdagent/factor_implementation/share_modules/prompts.yaml b/rdagent/factor_implementation/share_modules/prompts.yaml new file mode 100644 index 00000000..3b0ed0ad --- /dev/null +++ b/rdagent/factor_implementation/share_modules/prompts.yaml @@ -0,0 +1,196 @@ +evaluator_code_feedback_v1_system: |- + Your job is to give critic to user's code. User's code is expected to implement some factors in quant investment. The code contains reading data from a HDF5(H5) file, calculate the factor to each instrument on each datetime, and save the result pandas dataframe to a HDF5(H5) file. + + User will firstly provide you the information of the factor, which includes the name of the factor, description of the factor, the formulation of the factor and the description of the formulation. You can check whether user's code is align with the factor. + + The user will provide the source python code and the execution error message if execution failed. + The user might provide you the ground truth code for you to provide the critic. You should not leak the ground truth code to the user in any form but you can use it to provide the critic. 
+ + User has also compared the factor values calculated by the user's code and the ground truth code. The user will provide you some analyze result comparing two output. You may find some error in the code which caused the difference between the two output. + + If the ground truth code is provided, your critic should only consider checking whether the user's code is align with the ground truth code since the ground truth is definitely correct. + If the ground truth code is not provided, your critic should consider checking whether the user's code is reasonable and correct. + + You should provide the suggestion to each of your critic to help the user improve the code. Please response the critic in the following format. Here is an example structure for the output: + critic 1: The critic message to critic 1 + critic 2: The critic message to critic 2 +evaluator_code_feedback_v1_user: |- + --------------Factor information:--------------- + {{ factor_information }} + --------------Python code:--------------- + {{ code }} + --------------Execution feedback:--------------- + {{ execution_feedback }} + {% if factor_value_feedback is not none %} + --------------Factor value feedback:--------------- + {{ factor_value_feedback }} + {% endif %} + {% if gt_code is not none %} + --------------Ground truth Python code:--------------- + {{ gt_code }} + {% endif %} +evolving_strategy_factor_implementation_v1_system: |- + The user is trying to implement some factors in quant investment, and you are the one to help write the python code. + + {{ data_info }} + + The user will provide you a formulation of the factor, which contains some function calls and some operators. You need to implement the function calls and operators in python. Your code is expected to align the formulation in any form which means The user needs to get the exact factor values with your code as expected. + + Your code should contain the following part: the import part, the function part, and the main part. You should write a main function name: "calculate_{function_name}" and call this function in "if __name__ == __main__" part. Don't write any try-except block in your code. The user will catch the exception message and provide the feedback to you. + + User will write your code into a python file and execute the file directly with "python {your_file_name}.py". You should calculate the factor values and save the result into a HDF5(H5) file named "result.h5" in the same directory as your python file. The result file is a HDF5(H5) file containing a pandas dataframe. The index of the dataframe is the "datetime" and "instrument", and the single column name is the factor name,and the value is the factor value. The result file should be saved in the same directory as your python file. + + To help you write the correct code, the user might provide multiple information that helps you write the correct code: + 1. The user might provide you the correct code to similar factors. Your should learn from these code to write the correct code. + 2. The user might provide you the failed former code and the corresponding feedback to the code. The feedback contains to the execution, the code and the factor value. You should analyze the feedback and try to correct the latest code. + 3. The user might provide you the suggestion to the latest fail code and some similar fail to correct pairs. Each pair contains the fail code with similar error and the corresponding corrected version code. You should learn from these suggestion to write the correct code. 
+
+  You must write your code based on your latest former attempt below, which consists of your former code and the feedback on it. Read the former attempt carefully; do not modify the parts of your former code that are already correct.
+  {% if queried_former_failed_knowledge|length != 0 %}
+  --------------Your former latest attempt:---------------
+  {% for former_failed_knowledge in queried_former_failed_knowledge %}
+  =====Code to implementation {{ loop.index }}=====
+  {{ former_failed_knowledge.implementation.code }}
+  =====Feedback to implementation {{ loop.index }}=====
+  {{ former_failed_knowledge.feedback }}
+  {% endfor %}
+  {% endif %}
+
+  A typical format of `result.h5` may look like the following:
+  datetime    instrument
+  2020-01-02  SZ000001     -0.001796
+              SZ000166      0.005780
+              SZ000686      0.004228
+              SZ000712      0.001298
+              SZ000728      0.005330
+  ...
+  2021-12-31  SZ000750      0.000000
+              SZ000776      0.002459
+
+  Please respond with the code in the following JSON format. Here is an example structure for the JSON output:
+  {
+      "code": "The Python code as a string."
+  }
+
+evolving_strategy_factor_implementation_v1_user: |-
+  --------------Target factor information:---------------
+  {{ factor_information_str }}
+
+  {% if queried_similar_successful_knowledge|length != 0 %}
+  --------------Correct code to similar factors:---------------
+  {% for similar_successful_knowledge in queried_similar_successful_knowledge %}
+  =====Factor {{loop.index}}:=====
+  {{ similar_successful_knowledge.target_task.get_factor_information() }}
+  =====Code:=====
+  {{ similar_successful_knowledge.implementation.code }}
+  {% endfor %}
+  {% endif %}
+
+  {% if queried_former_failed_knowledge|length != 0 %}
+  --------------Former failed code:---------------
+  {% for former_failed_knowledge in queried_former_failed_knowledge %}
+  =====Code to implementation {{ loop.index }}=====
+  {{ former_failed_knowledge.implementation.code }}
+  =====Feedback to implementation {{ loop.index }}=====
+  {{ former_failed_knowledge.feedback }}
+  {% endfor %}
+  {% endif %}
+
+evaluator_final_decision_v1_system: |-
+  The user is trying to implement a factor in quant investment and has finished a version of the implementation, which has been evaluated and has received some feedback from the evaluator.
+  The evaluator ran the code, got the factor value dataframe, and provided several feedback items regarding the user's code and its output. You should analyze the feedback, taking the factor description into account, and give a final decision about the evaluation result. The final decision concludes whether the factor is implemented correctly; if the final decision is False, give detailed feedback containing the reason and suggestions.
+
+  The final decision on the implementation is made with the following logic:
+  1. If the value and the ground truth value are exactly the same within a small tolerance, the implementation is considered correct.
+  2. If the value and the ground truth value have a high correlation on ic or rank ic, the implementation is considered correct.
+  3. If no ground truth value is provided, the implementation is considered correct if the code executes successfully and the code feedback is reasonable.
+
+  Please respond with the critique in JSON format. 
Here is an example structure for the JSON output, please strictly follow the format: + { + "final_decision": True, + "final_feedback": "The final feedback message", + } + +evaluator_final_decision_v1_user: |- + --------------Factor information:--------------- + {{ factor_information }} + --------------Execution feedback:--------------- + {{ execution_feedback }} + --------------Code feedback:--------------- + {{ code_feedback }} + --------------Factor value feedback:--------------- + {{ factor_value_feedback }} + + +analyze_component_prompt_v1_system: |- + User is getting a new task that might consist of the components below (given in component_index: component_description): + {{all_component_content}} + + You should find out what components does the new task have, and put their indices in a list. + Please response the critic in the json format. Here is an example structure for the JSON output, please strictly follow the format: + { + "component_no_list": the list containing indices of components. + } + + + +evolving_strategy_factor_implementation_v2_user: |- + --------------Target factor information:--------------- + {{ factor_information_str }} + + {% if queried_similar_error_knowledge|length != 0 %} + {% if not error_summary %} + Recall your last failure, your implementation met some errors. + When doing other tasks, you met some similar errors but you finally solve them. Here are some examples: + {% for error_content, similar_error_knowledge in queried_similar_error_knowledge %} + --------------Factor information to similar error ({{error_content}}):--------------- + {{ similar_error_knowledge[0].target_task.get_factor_information() }} + =====Code with similar error ({{error_content}}):===== + {{ similar_error_knowledge[0].implementation.code }} + =====Success code to former code with similar error ({{error_content}}):===== + {{ similar_error_knowledge[1].implementation.code }} + {% endfor %} + {% else %} + Recall your last failure, your implementation met some errors. + After reviewing some similar errors and their solutions, here are some suggestions for you to correct your code: + {{error_summary_critics}} + {% endif %} + {% endif %} + {% if queried_similar_component_knowledge|length != 0 %} + Here are some success implements of similar component tasks, take them as references: + --------------Correct code to similar factors:--------------- + {% for similar_component_knowledge in queried_similar_component_knowledge %} + =====Factor {{loop.index}}:===== + {{ similar_component_knowledge.target_task.get_factor_information() }} + =====Code:===== + {{ similar_component_knowledge.implementation.code }} + {% endfor %} + {% endif %} + + +evolving_strategy_error_summary_v2_system: |- + You are doing the following task: + {{factor_information_str}} + + You have written some code but it meets errors like the following: + {{code_and_feedback}} + + The user has found some tasks that met similar errors, and their final correct solutions. + Please refer to these similar errors and their solutions, provide some clear, short and accurate critics that might help you solve the issues in your code. + + Please response the critic in the following format. 
Here is an example structure for the output:
+  critic 1: The critic message to critic 1
+  critic 2: The critic message to critic 2
+
+evolving_strategy_error_summary_v2_user: |-
+  {% if queried_similar_error_knowledge|length != 0 %}
+  {% for error_content, similar_error_knowledge in queried_similar_error_knowledge %}
+  --------------Factor information to similar error ({{error_content}}):---------------
+  {{ similar_error_knowledge[0].target_task.get_factor_information() }}
+  =====Code with similar error ({{error_content}}):=====
+  {{ similar_error_knowledge[0].implementation.code }}
+  =====Success code to former code with similar error ({{error_content}}):=====
+  {{ similar_error_knowledge[1].implementation.code }}
+  {% endfor %}
+  {% endif %}
+
diff --git a/rdagent/factor_implementation/share_modules/utils.py b/rdagent/factor_implementation/share_modules/utils.py
new file mode 100644
index 00000000..0f22aa4f
--- /dev/null
+++ b/rdagent/factor_implementation/share_modules/utils.py
@@ -0,0 +1,51 @@
+from pathlib import Path
+
+import pandas as pd
+
+# render it with jinja
+from jinja2 import Template
+
+from factor_implementation.share_modules.conf import FIS
+
+TPL = """
+{{file_name}}
+```{{type_desc}}
+{{content}}
+```
+"""
+# Create a Jinja template from the string
+JJ_TPL = Template(TPL)
+
+
+def get_data_folder_intro():
+    """Directly get the info of the data folder.
+    It is used to prepare the prompt message.
+    """
+    content_l = []
+    for p in Path(FIS.file_based_execution_data_folder).iterdir():
+        if p.name.endswith(".h5"):
+            df = pd.read_hdf(p)
+            # get df.head() as a string with full width
+            pd.set_option("display.max_columns", None)  # or 1000
+            pd.set_option("display.max_rows", None)  # or 1000
+            pd.set_option("display.max_colwidth", None)  # or 199
+            rendered = JJ_TPL.render(
+                file_name=p.name,
+                type_desc="generated by `pd.read_hdf(filename).head()`",
+                content=df.head().to_string(),
+            )
+            content_l.append(rendered)
+        elif p.name.endswith(".md"):
+            with open(p) as f:
+                content = f.read()
+            rendered = JJ_TPL.render(
+                file_name=p.name,
+                type_desc="markdown",
+                content=content,
+            )
+            content_l.append(rendered)
+        else:
+            raise NotImplementedError(
+                f"file type {p.name} is not supported. 
Please implement its description function.", + ) + return "\n ----------------- file spliter -------------\n".join(content_l) diff --git a/rdagent/knowledge_management/graph.py b/rdagent/knowledge_management/graph.py new file mode 100644 index 00000000..1031df60 --- /dev/null +++ b/rdagent/knowledge_management/graph.py @@ -0,0 +1,490 @@ +import pickle +import random +from collections import deque +from pathlib import Path +from typing import Dict, List, Tuple, Union + +from finco.llm import APIBackend +from finco.vector_base import KnowledgeMetaData, PDVectorBase, VectorBase, cosine + +Node = KnowledgeMetaData + + +class UndirectedNode(Node): + def __init__(self, content: str = "", label: str = "", embedding=None): + super().__init__(content, label, embedding) + self.neighbors = set() + + def add_neighbor(self, node): + self.neighbors.add(node) + node.neighbors.add(self) + + def remove_neighbor(self, node): + if node in self.neighbors: + self.neighbors.remove(node) + node.neighbors.remove(self) + + def get_neighbors(self): + return self.neighbors + + def __str__(self): + return ( + f"UndirectedNode(id={self.id}, label={self.label}, content={self.content[:100]}, " + f"neighbors={self.neighbors})" + ) + + def __repr__(self): + return ( + f"UndirectedNode(id={self.id}, label={self.label}, content={self.content[:100]}, " + f"neighbors={self.neighbors})" + ) + + +class Graph: + """ + base Graph class for Knowledge Graph Search + """ + + def __init__(self, path: Union[str, Path] = None): + self.path = path + self.nodes = {} + + def size(self): + return len(self.nodes) + + def get_node(self, node_id: str) -> Node: + node = self.nodes.get(node_id) + return node + + def add_node(self, **kwargs): + raise NotImplementedError + + def get_all_nodes(self) -> List: + return list(self.nodes.values()) + + def get_all_nodes_by_label_list(self, label_list: List[str]) -> List: + node_list = [] + for node in self.nodes.values(): + if node.label in label_list: + node_list.append(node) + return node_list + + def find_node(self, content: str, label: str): + for node in self.nodes.values(): + if node.content == content and node.label == label: + return node + + @classmethod + def load(cls, path: Union[str, Path]): + """use pickle as the default load method""" + path = path if isinstance(path, Path) else Path(path) + if not path.exists(): + return Graph(path=path) + + with open(path, "rb") as f: + return pickle.load(f) + + def save(self, path: Union[str, Path], **kwargs): + """use pickle as the default save method""" + Path.mkdir(path.parent, exist_ok=True) + with open(path, "wb") as f: + pickle.dump(self, f) + + @staticmethod + def batch_embedding(nodes: List[Node]): + contents = [node.content for node in nodes] + # openai create embedding API input's max length is 16 + size = 16 + embeddings = [] + for i in range(0, len(contents), size): + embeddings.extend( + APIBackend().create_embedding(input_content=contents[i : i + size]) + ) + + assert len(nodes) == len( + embeddings + ), "nodes' length must equals embeddings' length" + for node, embedding in zip(nodes, embeddings): + node.embedding = embedding + return nodes + + def __str__(self): + return f"Graph(nodes={self.nodes})" + + +class UndirectedGraph(Graph): + """ + Undirected Graph which edges have no relationship + """ + + def __init__(self, path: Union[str, Path] = None): + super().__init__(path=path) + self.vector_base: VectorBase = PDVectorBase() + + def __str__(self): + return f"UndirectedGraph(nodes={self.nodes})" + + def add_node( + self, + node: 
UndirectedNode, + neighbor: UndirectedNode = None, + same_node_threshold=0.95, + ): + """ + add node and neighbor to the Graph + Parameters + ---------- + same_node_threshold: 0.95 is an empirical value. When two strings only differ in case, the similarity is greater + than 0.95. + node + neighbor + + Returns + ------- + + """ + if self.get_node(node.id): + node = self.get_node(node.id) + elif self.find_node(content=node.content, label=node.label): + node = self.find_node(content=node.content, label=node.label) + else: + # same_node = self.semantic_search(node=node.content, similarity_threshold=same_node_threshold, topk_k=1) + # if len(same_node): + # node = same_node[0] + # else: + node.create_embedding() + self.vector_base.add(document=node) + self.nodes.update({node.id: node}) + + if neighbor is not None: + if self.get_node(neighbor.id): + neighbor = self.get_node(neighbor.id) + elif self.find_node(content=neighbor.content, label=node.label): + neighbor = self.find_node(content=neighbor.content, label=node.label) + else: + # same_node = self.semantic_search(node=neighbor.content, + # similarity_threshold=same_node_threshold, topk_k=1) + # if len(same_node): + # neighbor = same_node[0] + # else: + neighbor.create_embedding() + self.vector_base.add(document=neighbor) + self.nodes.update({neighbor.id: neighbor}) + + node.add_neighbor(neighbor) + + @classmethod + def load(cls, path: Union[str, Path]): + """use pickle as the default load method""" + path = path if isinstance(path, Path) else Path(path) + if not path.exists(): + return UndirectedGraph(path=path) + + with open(path, "rb") as f: + return pickle.load(f) + + def add_nodes(self, node: UndirectedNode, neighbors: List[UndirectedNode]): + if not len(neighbors): + self.add_node(node) + else: + for neighbor in neighbors: + self.add_node(node, neighbor=neighbor) + + def get_node(self, node_id: str) -> UndirectedNode: + node = self.nodes.get(node_id) + return node + + def get_node_by_content(self, content: str) -> Union[UndirectedNode, None]: + """ + Get node by semantic distance + Parameters + ---------- + content + + Returns + ------- + + """ + if content == "Model": + pass + match = self.semantic_search(node=content, similarity_threshold=0.999) + if len(match): + return match[0] + else: + return None + + def get_nodes_within_steps( + self, + start_node: UndirectedNode, + steps: int = 1, + constraint_labels: List[str] = None, + block: bool = False, + ) -> List[UndirectedNode]: + """ + Returns the nodes in the graph whose distance from node is less than or equal to step + """ + visited = set() + queue = deque([(start_node, 0)]) + result = [] + + while queue: + node, current_steps = queue.popleft() + + if current_steps > steps: + break + + if node not in visited: + visited.add(node) + result.append(node) + + for neighbor in sorted( + list(self.get_node(node.id).neighbors), key=lambda x: x.content + ): # to make sure the result is deterministic + if neighbor not in visited: + if not (block and neighbor.label not in constraint_labels): + queue.append((neighbor, current_steps + 1)) + + if constraint_labels: + result = [node for node in result if node.label in constraint_labels] + if start_node in result: + result.pop(result.index(start_node)) + return result + + def get_nodes_intersection( + self, + nodes: List[UndirectedNode], + steps: int = 1, + constraint_labels: List[str] = None, + ) -> List[UndirectedNode]: + """ + Get the intersection with nodes connected within n steps of nodes + + Parameters + ---------- + nodes + steps + 
constraint_labels + + Returns + ------- + + """ + assert len(nodes) >= 2, "nodes length must >=2" + intersection = None + + for node in nodes: + if intersection is None: + intersection = self.get_nodes_within_steps( + node, steps=steps, constraint_labels=constraint_labels + ) + intersection = self.intersection( + nodes1=intersection, + nodes2=self.get_nodes_within_steps( + node, steps=steps, constraint_labels=constraint_labels + ), + ) + + return intersection + + def semantic_search( + self, + node: Union[UndirectedNode, str], + similarity_threshold: float = 0.0, + topk_k: int = 5, + ) -> List[UndirectedNode]: + """ + semantic search by node's embedding + + Parameters + ---------- + topk_k + node + similarity_threshold: Returns nodes whose distance score from the input node is greater than similarity_threshold + + Returns + ------- + + """ + if isinstance(node, str): + node = UndirectedNode(content=node) + docs, scores = self.vector_base.search( + content=node.content, + topk_k=topk_k, + similarity_threshold=similarity_threshold, + ) + nodes = [self.get_node(doc.id) for doc in docs] + return nodes + + def clear(self): + self.nodes.clear() + self.vector_base: VectorBase = PDVectorBase() + + def query_by_node( + self, + node: UndirectedNode, + step: int = 1, + constraint_labels: List[str] = None, + constraint_node: UndirectedNode = None, + constraint_distance: float = 0, + block: bool = False, + ) -> List[UndirectedNode]: + """ + search graph by connection, return empty list if nodes' chain without node near to constraint_node + Parameters + ---------- + node + step + constraint_labels + constraint_node + constraint_distance + block: despite the start node, the search can only flow through the constraint_label type nodes + + Returns + ------- + + """ + nodes = self.get_nodes_within_steps( + start_node=node, + steps=step, + constraint_labels=constraint_labels, + block=block, + ) + if constraint_node is not None: + for n in nodes: + if self.cal_distance(n, constraint_node) > constraint_distance: + return nodes + return [] + return nodes + + def query_by_content( + self, + content: Union[str, List[str]], + topk_k: int = 5, + step: int = 1, + constraint_labels: List[str] = None, + constraint_node: UndirectedNode = None, + similarity_threshold: float = 0.0, + constraint_distance: float = 0, + block: bool = False, + ) -> List[UndirectedNode]: + """ + search graph by content similarity and connection relationship, return empty list if nodes' chain without node + near to constraint_node + + Parameters + ---------- + constraint_distance : float the distance between the node and the constraint_node + content : Union[str, List[str]] + topk_k: the upper number of output for each query, if the number of fit nodes is less than topk_k, return all fit nodes's content + step : the maximum distance between the start node and the result node + constraint_labels : the type of nodes that the search can only flow through + constraint_node : the node that the search can only flow through + similarity_threshold : the similarity threshold of the content + block: despite the start node, the search can only flow through the constraint_label type nodes + + Returns + ------- + + """ + + if isinstance(content, str): + content = [content] + + res_list = [] + for query in content: + similar_nodes = self.semantic_search( + content=query, topk_k=topk_k, similarity_threshold=similarity_threshold + ) + + connected_nodes = [] + for node in similar_nodes: + graph_query_node_res = self.query_by_node( + node, + step=step, + 
constraint_labels=constraint_labels, + constraint_node=constraint_node, + constraint_distance=constraint_distance, + block=block, + ) + connected_nodes.extend( + [ + node + for node in graph_query_node_res + if node not in connected_nodes + ] + ) + if len(connected_nodes) >= topk_k: + break + + res_list.extend( + [node for node in connected_nodes[:topk_k] if node not in res_list] + ) + return res_list + + @staticmethod + def intersection(nodes1: List[UndirectedNode], nodes2: List[UndirectedNode]): + intersection = [node for node in nodes1 if node in nodes2] + return intersection + + @staticmethod + def different(nodes1: List[UndirectedNode], nodes2: List[UndirectedNode]): + difference = list(set(nodes1).symmetric_difference(set(nodes2))) + return difference + + @staticmethod + def cal_distance(node1: UndirectedNode, node2: UndirectedNode): + distance = cosine(node1.embedding, node2.embedding) + return distance + + @staticmethod + def filter_label(nodes: List[UndirectedNode], labels: List[str]): + nodes = [node for node in nodes if node.label in labels] + return nodes + + +def graph_to_edges(graph: Dict[str, List[str]]): + edges = [] + + for node, neighbors in graph.items(): + for neighbor in neighbors: + if [node, neighbor] in edges or [neighbor, node] in edges: + continue + edges.append([node, neighbor]) + + return edges + + +def assign_random_coordinate_to_node( + nodes: List, scope: float = 1.0, origin: Tuple = (0.0, 0.0) +) -> Dict: + coordinates = {} + + for node in nodes: + x = random.uniform(0, scope) + origin[0] + y = random.uniform(0, scope) + origin[1] + coordinates[node] = (x, y) + + return coordinates + + +def assign_isometric_coordinate_to_node( + nodes: List, x_step: float = 1.0, x_origin: float = 0.0, y_origin: float = 0.0 +) -> Dict: + coordinates = {} + + for i, node in enumerate(nodes): + x = x_origin + i * x_step + y = y_origin + coordinates[node] = (x, y) + + return coordinates + + +def curly_node_coordinate( + coordinates: Dict, center_y: float = 1.0, r: float = 1.0 +) -> Dict: + # noto: this method can only curly < 90 degree, and the curl line is circle. + # the original funtion is: x**2 + (y-m)**2 = r**2 + for node, coordinate in coordinates.items(): + coordinate[1] = center_y + (r**2 - coordinate[0] ** 2) ** 0.5 + return coordinates diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index e69de29b..73eecf8f 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -0,0 +1,706 @@ +import datetime +import hashlib +import json +import multiprocessing +import os +import re +import sqlite3 +import ssl +import time +import urllib.request +import uuid +from copy import deepcopy +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import numpy as np +import tiktoken +from scipy.spatial.distance import cosine + +from core.conf import FincoSettings as Config +from core.log import FinCoLog, LogColors +from core.utils import SingletonBaseClass + +DEFAULT_QLIB_DOT_PATH = Path("./") + + +def md5_hash(input_string): + md5 = hashlib.md5() + input_bytes = input_string.encode("utf-8") + md5.update(input_bytes) + hashed_string = md5.hexdigest() + return hashed_string + + +try: + import openai +except ImportError: + FinCoLog().warning("openai is not installed.") + +try: + from llama import Llama +except ImportError: + FinCoLog().warning("llama is not installed.") + + +class ConvManager: + """ + This is a conversation manager of LLM + It is for convenience of exporting conversation for debugging. 
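+
+    A minimal usage sketch (`messages` and `response` are illustrative):
+
+    .. code-block:: python
+
+        conv = ConvManager()
+        # rotates 0.json -> 1.json -> ... and writes the newest pair to 0.json
+        conv.append((messages, response))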
+ """ + + def __init__( + self, + path: Union[Path, str] = DEFAULT_QLIB_DOT_PATH / "llm_conv", + recent_n: int = 10, + ) -> None: + self.path = Path(path) + self.path.mkdir(parents=True, exist_ok=True) + self.recent_n = recent_n + + def _rotate_files(self): + pairs = [] + for f in self.path.glob("*.json"): + m = re.match(r"(\d+).json", f.name) + if m is not None: + n = int(m.group(1)) + pairs.append((n, f)) + pass + pairs.sort(key=lambda x: x[0]) + for n, f in pairs[: self.recent_n][::-1]: + if Path(self.path / f"{n+1}.json").exists(): + os.remove(self.path / f"{n+1}.json") + f.rename(self.path / f"{n+1}.json") + + def append(self, conv: Tuple[list, str]): + self._rotate_files() + json.dump(conv, open(self.path / "0.json", "w")) + # TODO: reseve line breaks to make it more convient to edit file directly. + + +class SQliteLazyCache(SingletonBaseClass): + def __init__(self, cache_location) -> None: + super().__init__() + self.cache_location = cache_location + db_file_exist = os.path.exists(cache_location) + self.conn = sqlite3.connect(cache_location) + self.c = self.conn.cursor() + if not db_file_exist: + self.c.execute( + """ + CREATE TABLE chat_cache ( + md5_key TEXT PRIMARY KEY, + chat TEXT + ) + """ + ) + self.c.execute( + """ + CREATE TABLE embedding_cache ( + md5_key TEXT PRIMARY KEY, + embedding TEXT + ) + """ + ) + self.conn.commit() + + def chat_get(self, key): + md5_key = md5_hash(key) + self.c.execute("SELECT chat FROM chat_cache WHERE md5_key=?", (md5_key,)) + result = self.c.fetchone() + if result is None: + return None + else: + return result[0] + + def embedding_get(self, key): + md5_key = md5_hash(key) + self.c.execute("SELECT embedding FROM embedding_cache WHERE md5_key=?", (md5_key,)) + result = self.c.fetchone() + if result is None: + return None + else: + return json.loads(result[0]) + + def chat_set(self, key, value): + md5_key = md5_hash(key) + self.c.execute( + "INSERT OR REPLACE INTO chat_cache (md5_key, chat) VALUES (?, ?)", + (md5_key, value), + ) + self.conn.commit() + + def embedding_set(self, content_to_embedding_dict): + for key, value in content_to_embedding_dict.items(): + md5_key = md5_hash(key) + self.c.execute( + "INSERT OR REPLACE INTO embedding_cache (md5_key, embedding) VALUES (?, ?)", + (md5_key, json.dumps(value)), + ) + self.conn.commit() + + +class SessionChatHistoryCache(SingletonBaseClass): + def __init__(self) -> None: + """load all history conversation json file from self.session_cache_location""" + self.cfg = Config() + self.session_cache_location = Path(self.cfg.session_cache_folder_location) + self.cache = {} + if not self.session_cache_location.exists(): + FinCoLog.warning(f"Directory {self.session_cache_location} does not exist.") + self.session_cache_location.mkdir(parents=True, exist_ok=True) + json_files = [f for f in self.session_cache_location.iterdir() if f.suffix == ".json"] + if not json_files: + FinCoLog.info(f"No JSON files found in {self.session_cache_location}.") + for file_path in json_files: + conversation_id = file_path.stem + with file_path.open("r") as f: + conversation_content = json.load(f) + self.cache[conversation_id] = conversation_content["content"] + + def message_get(self, conversation_id: str): + return self.cache.get(conversation_id, []) + + def message_set(self, conversation_id, message_value): + self.cache[conversation_id] = message_value + conversation_path = self.session_cache_location / conversation_id + conversation_path = conversation_path.with_suffix(".json") + current_time = 
+class SessionChatHistoryCache(SingletonBaseClass):
+    def __init__(self) -> None:
+        """load all history conversation json files from self.session_cache_location"""
+        self.cfg = Config()
+        self.session_cache_location = Path(self.cfg.session_cache_folder_location)
+        self.cache = {}
+        if not self.session_cache_location.exists():
+            FinCoLog().warning(f"Directory {self.session_cache_location} does not exist.")
+            self.session_cache_location.mkdir(parents=True, exist_ok=True)
+        json_files = [f for f in self.session_cache_location.iterdir() if f.suffix == ".json"]
+        if not json_files:
+            FinCoLog().info(f"No JSON files found in {self.session_cache_location}.")
+        for file_path in json_files:
+            conversation_id = file_path.stem
+            with file_path.open("r") as f:
+                conversation_content = json.load(f)
+                self.cache[conversation_id] = conversation_content["content"]
+
+    def message_get(self, conversation_id: str):
+        return self.cache.get(conversation_id, [])
+
+    def message_set(self, conversation_id, message_value):
+        self.cache[conversation_id] = message_value
+        conversation_path = self.session_cache_location / conversation_id
+        conversation_path = conversation_path.with_suffix(".json")
+        current_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+        with open(conversation_path, "w") as f:
+            json.dump({"content": message_value, "last_modified_time": current_time}, f)
+
+
+class ChatSession:
+    def __init__(self, api_backend, conversation_id=None, system_prompt=None):
+        self.conversation_id = str(uuid.uuid4()) if conversation_id is None else conversation_id
+        self.cfg = Config()
+        self.system_prompt = system_prompt if system_prompt is not None else self.cfg.default_system_prompt
+        self.api_backend = api_backend
+
+    def build_chat_completion_message(self, user_prompt, **kwargs):
+        history_message = SessionChatHistoryCache().message_get(self.conversation_id)
+        messages = history_message
+        if not messages:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append(
+            {
+                "role": "user",
+                "content": user_prompt,
+            }
+        )
+        return messages
+
+    def build_chat_completion_message_and_calculate_token(self, user_prompt, **kwargs):
+        messages = self.build_chat_completion_message(user_prompt, **kwargs)
+        return self.api_backend.calculate_token_from_messages(messages)
+
+    def build_chat_completion(self, user_prompt, **kwargs):
+        """
+        Build the session messages and get a completion;
+        the user prompt must always be provided.
+        """
+        messages = self.build_chat_completion_message(user_prompt, **kwargs)
+
+        response = self.api_backend._try_create_chat_completion_or_embedding(
+            messages=messages, chat_completion=True, **kwargs
+        )
+        messages.append(
+            {
+                "role": "assistant",
+                "content": response,
+            }
+        )
+        SessionChatHistoryCache().message_set(self.conversation_id, messages)
+        return response
+
+    def get_conversation_id(self):
+        return self.conversation_id
+
+    def display_history(self):
+        # TODO: realize a beautiful presentation format for history messages
+        pass
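+# Example (illustrative sketch; assumes a configured APIBackend and .env, and
+# the prompts shown are hypothetical):
+#
+#     session = APIBackend().build_chat_session(session_system_prompt="You are a quant assistant.")
+#     answer = session.build_chat_completion("Explain the momentum factor in one line.")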
+class APIBackend:
+    def __init__(
+        self,
+        *,
+        chat_api_key=None,
+        chat_model=None,
+        chat_api_base=None,
+        chat_api_version=None,
+        embedding_api_key=None,
+        embedding_model=None,
+        embedding_api_base=None,
+        embedding_api_version=None,
+        use_chat_cache=None,
+        dump_chat_cache=None,
+        use_embedding_cache=None,
+        dump_embedding_cache=None,
+    ) -> None:
+        self.cfg = Config()
+        if self.cfg.use_llama2:
+            self.generator = Llama.build(
+                ckpt_dir=self.cfg.llama2_ckpt_dir,
+                tokenizer_path=self.cfg.llama2_tokenizer_path,
+                max_seq_len=self.cfg.max_tokens,
+                max_batch_size=self.cfg.llams2_max_batch_size,
+            )
+            self.encoder = None
+        elif self.cfg.use_gcr_endpoint:
+            if self.cfg.gcr_endpoint_type == "llama2_70b":
+                self.gcr_endpoint_key = self.cfg.llama2_70b_endpoint_key
+                self.gcr_endpoint_deployment = self.cfg.llama2_70b_endpoint_deployment
+                self.gcr_endpoint = self.cfg.llama2_70b_endpoint
+            elif self.cfg.gcr_endpoint_type == "llama3_70b":
+                self.gcr_endpoint_key = self.cfg.llama3_70b_endpoint_key
+                self.gcr_endpoint_deployment = self.cfg.llama3_70b_endpoint_deployment
+                self.gcr_endpoint = self.cfg.llama3_70b_endpoint
+            elif self.cfg.gcr_endpoint_type == "phi2":
+                self.gcr_endpoint_key = self.cfg.phi2_endpoint_key
+                self.gcr_endpoint_deployment = self.cfg.phi2_endpoint_deployment
+                self.gcr_endpoint = self.cfg.phi2_endpoint
+            elif self.cfg.gcr_endpoint_type == "phi3_4k":
+                self.gcr_endpoint_key = self.cfg.phi3_4k_endpoint_key
+                self.gcr_endpoint_deployment = self.cfg.phi3_4k_endpoint_deployment
+                self.gcr_endpoint = self.cfg.phi3_4k_endpoint
+            elif self.cfg.gcr_endpoint_type == "phi3_128k":
+                self.gcr_endpoint_key = self.cfg.phi3_128k_endpoint_key
+                self.gcr_endpoint_deployment = self.cfg.phi3_128k_endpoint_deployment
+                self.gcr_endpoint = self.cfg.phi3_128k_endpoint
+            else:
+                raise ValueError(f"Invalid gcr_endpoint_type: {self.cfg.gcr_endpoint_type}")
+            self.headers = {
+                "Content-Type": "application/json",
+                "Authorization": ("Bearer " + self.gcr_endpoint_key),
+                "azureml-model-deployment": self.gcr_endpoint_deployment,
+            }
+            self.gcr_endpoint_temperature = self.cfg.gcr_endpoint_temperature
+            self.gcr_endpoint_top_p = self.cfg.gcr_endpoint_top_p
+            self.gcr_endpoint_do_sample = self.cfg.gcr_endpoint_do_sample
+            self.gcr_endpoint_max_token = self.cfg.gcr_endpoint_max_token
+            if not os.environ.get("PYTHONHTTPSVERIFY", "") and getattr(ssl, "_create_unverified_context", None):
+                ssl._create_default_https_context = ssl._create_unverified_context
+            self.encoder = None
+        else:
+            self.use_azure = self.cfg.use_azure
+
+            self.chat_api_key = self.cfg.chat_openai_api_key if chat_api_key is None else chat_api_key
+            self.chat_model = self.cfg.chat_model if chat_model is None else chat_model
+            self.encoder = tiktoken.encoding_for_model(self.chat_model)
+            self.chat_api_base = self.cfg.chat_azure_api_base if chat_api_base is None else chat_api_base
+            self.chat_api_version = self.cfg.chat_azure_api_version if chat_api_version is None else chat_api_version
+            self.chat_stream = self.cfg.chat_stream
+            self.chat_seed = self.cfg.chat_seed
+
+            self.embedding_api_key = (
+                self.cfg.embedding_openai_api_key if embedding_api_key is None else embedding_api_key
+            )
+            self.embedding_model = self.cfg.embedding_model if embedding_model is None else embedding_model
+            self.embedding_api_base = (
+                self.cfg.embedding_azure_api_base if embedding_api_base is None else embedding_api_base
+            )
+            self.embedding_api_version = (
+                self.cfg.embedding_azure_api_version if embedding_api_version is None else embedding_api_version
+            )
+
+            if self.use_azure:
+                self.chat_client = openai.AzureOpenAI(
+                    api_key=self.chat_api_key,
+                    api_version=self.chat_api_version,
+                    azure_endpoint=self.chat_api_base,
+                )
+                self.embedding_client = openai.AzureOpenAI(
+                    api_key=self.embedding_api_key,
+                    api_version=self.embedding_api_version,
+                    azure_endpoint=self.embedding_api_base,
+                )
+            else:
+                self.chat_client = openai.OpenAI(api_key=self.chat_api_key)
+                self.embedding_client = openai.OpenAI(api_key=self.embedding_api_key)
+
+        self.dump_chat_cache = self.cfg.dump_chat_cache if dump_chat_cache is None else dump_chat_cache
+        self.use_chat_cache = self.cfg.use_chat_cache if use_chat_cache is None else use_chat_cache
+        self.dump_embedding_cache = (
+            self.cfg.dump_embedding_cache if dump_embedding_cache is None else dump_embedding_cache
+        )
+        self.use_embedding_cache = self.cfg.use_embedding_cache if use_embedding_cache is None else use_embedding_cache
+        if self.dump_chat_cache or self.use_chat_cache or self.dump_embedding_cache or self.use_embedding_cache:
+            self.cache_file_location = self.cfg.prompt_cache_path
+            self.cache = SQliteLazyCache(self.cache_file_location)
+
+        # keep a copy of config values that are not supposed to change during runtime
+        self.use_llama2 = self.cfg.use_llama2
+        self.use_gcr_endpoint = self.cfg.use_gcr_endpoint
+        self.retry_wait_seconds = self.cfg.retry_wait_seconds
+
+    def build_chat_session(self, conversation_id=None, session_system_prompt=None):
+        """
+        conversation_id is a 128-bit UUID string created by uuid.uuid4() and is also
+        the file name under session_cache_folder/ for each conversation
+        """
+        session = ChatSession(self, conversation_id, session_system_prompt)
+        return session
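+    # Shape of the list returned by build_messages below (descriptive sketch):
+    #
+    #     [
+    #         {"role": "system", "content": "<system prompt>"},
+    #         ...former messages (at most cfg.max_past_message_include)...,
+    #         {"role": "user", "content": "<user prompt>"},
+    #     ]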
+    def build_messages(
+        self,
+        user_prompt,
+        system_prompt=None,
+        former_messages=[],
+        shrink_multiple_break=False,
+    ):
+        """Build the message list shared by the chat-completion helpers."""
+        system_prompt = self.cfg.default_system_prompt if system_prompt is None else system_prompt
+        # shrink_multiple_break repeatedly collapses runs of more than two consecutive line breaks
+        if shrink_multiple_break:
+            while "\n\n\n" in user_prompt:
+                user_prompt = user_prompt.replace("\n\n\n", "\n\n")
+            while "\n\n\n" in system_prompt:
+                system_prompt = system_prompt.replace("\n\n\n", "\n\n")
+        messages = [
+            {
+                "role": "system",
+                "content": system_prompt,
+            }
+        ]
+        messages.extend(former_messages[-1 * self.cfg.max_past_message_include :])
+        messages.append(
+            {
+                "role": "user",
+                "content": user_prompt,
+            }
+        )
+        return messages
+
+    def build_messages_and_create_chat_completion(
+        self,
+        user_prompt,
+        system_prompt=None,
+        former_messages=[],
+        shrink_multiple_break=False,
+        chat_cache_prefix="",
+        **kwargs,
+    ):
+        messages = self.build_messages(user_prompt, system_prompt, former_messages, shrink_multiple_break)
+        response = self._try_create_chat_completion_or_embedding(
+            messages=messages,
+            chat_completion=True,
+            chat_cache_prefix=chat_cache_prefix,
+            **kwargs,
+        )
+
+        # if self.debug_mode:
+        # ConvManager().append((messages, response))
+        return response
+
+    def create_embedding(self, input_content, **kwargs):
+        if isinstance(input_content, str):
+            input_content_list = [input_content]
+        elif isinstance(input_content, list):
+            input_content_list = input_content
+        resp = self._try_create_chat_completion_or_embedding(
+            input_content_list=input_content_list, embedding=True, **kwargs
+        )
+        if isinstance(input_content, str):
+            return resp[0]
+        elif isinstance(input_content, list):
+            return resp
+
+    def _create_chat_completion_auto_continue(self, messages, **kwargs):
+        """
+        Call the chat completion function and automatically continue the
+        conversation when the finish_reason is "length".
+        # TODO: this function only continues once; it may need to continue more than once in the future
+        """
+        response, finish_reason = self._create_chat_completion_inner_function(messages=messages, **kwargs)
+
+        if finish_reason == "length":
+            new_message = deepcopy(messages)
+            new_message.append({"role": "assistant", "content": response})
+            new_message.append(
+                {
+                    "role": "user",
+                    "content": "continue the former output with no overlap",
+                }
+            )
+            new_response, finish_reason = self._create_chat_completion_inner_function(messages=new_message, **kwargs)
+            return response + new_response
+        else:
+            return response
+
+    def _try_create_chat_completion_or_embedding(self, max_retry=10, chat_completion=False, embedding=False, **kwargs):
+        assert not (chat_completion and embedding), "chat_completion and embedding cannot be True at the same time"
+        max_retry = self.cfg.max_retry if self.cfg.max_retry is not None else max_retry
+        for i in range(max_retry):
+            try:
+                if embedding:
+                    response = self._create_embedding_inner_function(**kwargs)
+                    return response
+                elif chat_completion:
+                    response = self._create_chat_completion_auto_continue(**kwargs)
+                    return response
+            except Exception as e:
+                print(e)
+                print(f"Retrying, attempt {i + 1}...")
+                if (
+                    isinstance(e, openai.BadRequestError)
+                    and r"'messages' must contain the word 'json' in some form" in e.message
+                ):
+                    kwargs["add_json_in_prompt"] = True
+                elif isinstance(e, openai.BadRequestError) and embedding and "maximum context length" in e.message:
+                    # halve every embedding input and retry until the request fits the context window
+                    for index in range(len(kwargs["input_content_list"])):
+                        kwargs["input_content_list"][index] = kwargs["input_content_list"][index][
+                            : len(kwargs["input_content_list"][index]) // 2
+                        ]
+                else:
+                    time.sleep(self.retry_wait_seconds)
+                continue
+        raise Exception(f"Failed to create chat completion after {max_retry} retries.")
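+    # Example (illustrative sketch; requires valid embedding credentials, and
+    # the input strings are hypothetical):
+    #
+    #     emb = APIBackend().create_embedding("momentum factor")   # -> List[float]
+    #     embs = APIBackend().create_embedding(["alpha", "beta"])  # -> List[List[float]]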
"maximum context length" in e.message: + for index in range(len(kwargs["input_content_list"])): + kwargs["input_content_list"][index] = kwargs["input_content_list"][index][ + : len(kwargs["input_content_list"][index]) // 2 + ] + else: + time.sleep(self.retry_wait_seconds) + continue + raise Exception(f"Failed to create chat completion after {max_retry} retries.") + + def _create_embedding_inner_function(self, input_content_list, **kwargs): + content_to_embedding_dict = {} + filtered_input_content_list = [] + if self.use_embedding_cache: + for content in input_content_list: + cache_result = self.cache.embedding_get(content) + if cache_result is not None: + content_to_embedding_dict[content] = cache_result + else: + filtered_input_content_list.append(content) + else: + filtered_input_content_list = input_content_list + + if len(filtered_input_content_list) > 0: + if self.use_azure: + response = self.embedding_client.embeddings.create( + model=self.embedding_model, + input=filtered_input_content_list, + ) + else: + response = self.embedding_client.embeddings.create( + model=self.embedding_model, + input=filtered_input_content_list, + ) + for index, data in enumerate(response.data): + content_to_embedding_dict[filtered_input_content_list[index]] = data.embedding + + if self.dump_embedding_cache: + self.cache.embedding_set(content_to_embedding_dict) + resp = [content_to_embedding_dict[content] for content in input_content_list] + return resp + + def _build_messages(self, messages): + log_messages = "" + for m in messages: + log_messages += ( + f"\n{LogColors.MAGENTA}{LogColors.BOLD}Role:{LogColors.END}" + + f"{LogColors.CYAN}{m['role']}{LogColors.END}\n" + + f"{LogColors.MAGENTA}{LogColors.BOLD}Content:{LogColors.END} " + + f"{LogColors.CYAN}{m['content']}{LogColors.END}\n" + ) + return log_messages + + def log_messages(self, messages): + if self.cfg.log_llm_chat_content: + FinCoLog().info(self._build_messages(messages)) + + def log_response(self, response=None, stream=False): + if self.cfg.log_llm_chat_content: + if stream: + FinCoLog().info(f"\n{LogColors.CYAN}Response:{LogColors.END}") + else: + FinCoLog().info(f"\n{LogColors.CYAN}Response:{response}{LogColors.END}") + + def _create_chat_completion_inner_function( + self, + messages, + temperature: float = None, + max_tokens: Optional[int] = None, + chat_cache_prefix="", + json_mode=False, + add_json_in_prompt=False, + frequency_penalty=None, + presence_penalty=None, + ) -> str: + self.log_messages(messages) + # TODO: fail to use loguru adaptor due to stream response + input_content_json = json.dumps(messages) + input_content_json = ( + chat_cache_prefix + input_content_json + ) # FIXME this is a hack to make sure the cache represents the round index + if self.use_chat_cache: + cache_result = self.cache.chat_get(input_content_json) + if cache_result is not None: + return cache_result, None + + if temperature is None: + temperature = self.cfg.chat_temperature + if max_tokens is None: + max_tokens = self.cfg.chat_max_tokens + if frequency_penalty is None: + frequency_penalty = self.cfg.chat_frequency_penalty + if presence_penalty is None: + presence_penalty = self.cfg.chat_presence_penalty + + finish_reason = None + if self.use_llama2: + response = self.generator.chat_completion( + messages, # type: ignore + max_gen_len=max_tokens, + temperature=temperature, + ) + resp = response[0]["generation"]["content"] + self.log_response(resp) + elif self.use_gcr_endpoint: + body = str.encode( + json.dumps( + { + "input_data": { + "input_string": 
+    def _create_chat_completion_inner_function(
+        self,
+        messages,
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        chat_cache_prefix="",
+        json_mode=False,
+        add_json_in_prompt=False,
+        frequency_penalty=None,
+        presence_penalty=None,
+    ) -> str:
+        self.log_messages(messages)
+        # TODO: the loguru adapter cannot be used here because of streaming responses
+        input_content_json = json.dumps(messages)
+        input_content_json = (
+            chat_cache_prefix + input_content_json
+        )  # FIXME: this is a hack to make sure the cache represents the round index
+        if self.use_chat_cache:
+            cache_result = self.cache.chat_get(input_content_json)
+            if cache_result is not None:
+                return cache_result, None
+
+        if temperature is None:
+            temperature = self.cfg.chat_temperature
+        if max_tokens is None:
+            max_tokens = self.cfg.chat_max_tokens
+        if frequency_penalty is None:
+            frequency_penalty = self.cfg.chat_frequency_penalty
+        if presence_penalty is None:
+            presence_penalty = self.cfg.chat_presence_penalty
+
+        finish_reason = None
+        if self.use_llama2:
+            response = self.generator.chat_completion(
+                messages,  # type: ignore
+                max_gen_len=max_tokens,
+                temperature=temperature,
+            )
+            resp = response[0]["generation"]["content"]
+            self.log_response(resp)
+        elif self.use_gcr_endpoint:
+            body = str.encode(
+                json.dumps(
+                    {
+                        "input_data": {
+                            "input_string": messages,
+                            "parameters": {
+                                "temperature": self.gcr_endpoint_temperature,
+                                "top_p": self.gcr_endpoint_top_p,
+                                "do_sample": self.gcr_endpoint_do_sample,
+                                "max_new_tokens": self.gcr_endpoint_max_token,
+                            },
+                        }
+                    }
+                )
+            )
+
+            req = urllib.request.Request(self.gcr_endpoint, body, self.headers)
+            response = urllib.request.urlopen(req)
+            resp = json.loads(response.read().decode())["output"]
+            self.log_response(resp)
+        else:
+            if self.use_azure:
+                if json_mode:
+                    if add_json_in_prompt:
+                        for message in messages[::-1]:
+                            message["content"] = message["content"] + "\nPlease respond in json format."
+                            if message["role"] == "system":
+                                break
+                    response = self.chat_client.chat.completions.create(
+                        model=self.chat_model,
+                        messages=messages,
+                        max_tokens=max_tokens,
+                        temperature=temperature,
+                        response_format={"type": "json_object"},
+                        stream=self.chat_stream,
+                        seed=self.chat_seed,
+                        frequency_penalty=frequency_penalty,
+                        presence_penalty=presence_penalty,
+                    )
+                else:
+                    response = self.chat_client.chat.completions.create(
+                        model=self.chat_model,
+                        messages=messages,
+                        max_tokens=max_tokens,
+                        temperature=temperature,
+                        stream=self.chat_stream,
+                        seed=self.chat_seed,
+                        frequency_penalty=frequency_penalty,
+                        presence_penalty=presence_penalty,
+                    )
+            else:
+                response = self.chat_client.chat.completions.create(
+                    model=self.chat_model,
+                    messages=messages,
+                    stream=self.chat_stream,
+                    seed=self.chat_seed,
+                    frequency_penalty=frequency_penalty,
+                    presence_penalty=presence_penalty,
+                )
+            if self.chat_stream:
+                self.log_response(stream=True)
+                resp = ""
+                for chunk in response:
+                    content = (
+                        chunk.choices[0].delta.content
+                        if len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None
+                        else ""
+                    )
+                    if self.cfg.log_llm_chat_content:
+                        print(LogColors.CYAN + content, end="")
+                    resp += content
+                    if len(chunk.choices) > 0 and chunk.choices[0].finish_reason is not None:
+                        finish_reason = chunk.choices[0].finish_reason
+            else:
+                resp = response.choices[0].message.content
+                finish_reason = response.choices[0].finish_reason
+                self.log_response(resp)
+            if json_mode:
+                json.loads(resp)  # raise early if the response is not valid JSON
+        if self.dump_chat_cache:
+            self.cache.chat_set(input_content_json, resp)
+        return resp, finish_reason
+
+    def calculate_token_from_messages(self, messages):
+        if self.use_llama2 or self.use_gcr_endpoint:
+            FinCoLog().warning("num_tokens_from_messages() is not implemented for llama2 or GCR endpoints.")
+            return 0  # TODO: implement this function for llama2 and GCR endpoints
+
+        if "gpt4" in self.chat_model or "gpt-4" in self.chat_model:
+            tokens_per_message = 3
+            tokens_per_name = 1
+        else:
+            tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
+            tokens_per_name = -1  # if there's a name, the role is omitted
+        num_tokens = 0
+        for message in messages:
+            num_tokens += tokens_per_message
+            for key, value in message.items():
+                num_tokens += len(self.encoder.encode(value))
+                if key == "name":
+                    num_tokens += tokens_per_name
+        num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
+        return num_tokens
+
+    def build_messages_and_calculate_token(
+        self,
+        user_prompt,
+        system_prompt,
+        former_messages=[],
+        shrink_multiple_break=False,
+    ):
+        messages = self.build_messages(user_prompt, system_prompt, former_messages, shrink_multiple_break)
+        return self.calculate_token_from_messages(messages)
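+# Example (illustrative sketch; json_mode is only honored on the Azure path,
+# and assumes the .env credentials are configured):
+#
+#     resp = APIBackend().build_messages_and_create_chat_completion(
+#         user_prompt='Reply with {"ok": true}.',
+#         json_mode=True,
+#     )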
+def calculate_embedding_process(str_list):
+    return APIBackend().create_embedding(str_list)
+
+
+def create_embedding_with_multiprocessing(str_list, slice_count=50, nproc=8):
+    embeddings = []
+
+    pool = multiprocessing.Pool(nproc)
+    result_list = []
+    for index in range(0, len(str_list), slice_count):
+        result_list.append(pool.apply_async(calculate_embedding_process, (str_list[index : index + slice_count],)))
+
+    pool.close()
+    pool.join()
+
+    for res in result_list:
+        embeddings.extend(res.get())
+    return embeddings
+
+
+def calculate_embedding_distance_between_str_list(source_str_list: List, target_str_list: List):
+    # note: despite the name, this returns cosine *similarity* (higher = closer)
+    if len(source_str_list) == 0 or len(target_str_list) == 0:
+        return [[]]
+
+    embeddings = create_embedding_with_multiprocessing(source_str_list + target_str_list, slice_count=50, nproc=8)
+    source_embeddings = embeddings[: len(source_str_list)]
+    target_embeddings = embeddings[len(source_str_list) :]
+
+    source_embeddings_np = np.array(source_embeddings)
+    target_embeddings_np = np.array(target_embeddings)
+
+    source_embeddings_np = source_embeddings_np / np.linalg.norm(source_embeddings_np, axis=1, keepdims=True)
+    target_embeddings_np = target_embeddings_np / np.linalg.norm(target_embeddings_np, axis=1, keepdims=True)
+    similarity_matrix = np.dot(source_embeddings_np, target_embeddings_np.T)
+
+    return similarity_matrix.tolist()
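+# Example (illustrative sketch; the factor names are hypothetical):
+#
+#     sim = calculate_embedding_distance_between_str_list(
+#         ["momentum factor"], ["reversal factor", "momentum factor"]
+#     )
+#     # sim is a 1 x 2 matrix of cosine similarities; higher means more similar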