Readme, auto detect data type, more check

Mr-SGXXX · Mar 4, 2024 · 8d7020b · 8d7020b
1 parent 2e34fcb
commit 8d7020b
Show file tree

Hide file tree

Showing 5 changed files with 156 additions and 55 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,66 @@
+# pyerm (Python Experiment Record Manager)
+This project is an experiment record manager for Python based on the SQLite DBMS, which can help you efficiently save your experiment settings and results for later analysis.
+
+*In the current version, all operations will be performed locally.*
+
+# Introduction
+This project is used to save the settings and results of any experiment, which consists of three parts: method, data, and task.
+
+Besides, the basic information and detail information of the experiment will also be recorded.
+
+All the data you want can be saved efficiently through the provided API without knowing the implementation details, but I suggest reading the table introduction if you want to process the records further.
+
+## Workflow Introduction
+### Table Define & Init
+Before starting the experiment, you need to initialize the tables the experiment requires with three init functions: `data_init()`, `method_init()`, `task_init()`.
+
+You need to pass the name and the experiment parameters to the first two. These functions automatically detect the data types and create the table if it does not exist. If you want to define the SQL column types yourself, you can pass a `param_def_dict` to these functions, whose keys are column names and whose values are SQL column type definitions, like `{"people": "TEXT DEFAULT NULL"}`.
+
+### Experiment 
+
+The experiment recorder mainly consists of four parts: `experiment_start()`, `experiment_over()`, `experiment_failed()`, `detail_update()`. From the names of these functions, you can easily tell where and how to use them.
+
+`experiment_start()` saves the basic experiment information before experiment formally starts and will set the experiment status to running.
+
+`experiment_over()` saves the experiment results after experiment ends and will set the experiment status to over.
+
+`experiment_failed()` saves the reason why experiment failed and will set the experiment status to failed.
+
+`detail_update()` saves the intermediate results. It's optional; if you never call it and don't manually provide the definition dict, the detail table may not be created.
+
+## Table Introduction
+
+### Experiment Table
+All experiments' basic information will be recorded in the experiment_list table. It contains the description of the experiment, the method (with its setting id), the data (with its setting id), the task, the start & end time of the experiment, the useful & total time cost, tags, experimenters, the failure reason and the experiment status. Each experiment is identified by its experiment id.
+
+### Method Table
+Each Method Table is identified by its corresponding method name, and any different method will be assigned a different table for saving its different parameter setting, such as method-specific hyper-parameters, etc. The table is used to save different parameter for every method.
+
+The only necessary column for method table is the method setting id, which will be set automatically, other specific column is set by users.
+
+### Data Table
+Each Data is identified by its corresponding data name, and any different data will be assigned a different table for saving its different parameter setting, such as data-specific preprocess parameters, etc. The table is used to save different parameter for every data.
+
+The only necessary column for a data table is the data setting id, which will be set automatically; other specific columns are set by users.
+
+### Result Table
+Each Result Table is identified by its corresponding task name, and different tasks will be assigned with different tables for saving its different experiment results, such as accuracy for classification, normalized mutual information for clustering. 
+
+Besides, this table offers several columns for saving images for later visualization.
+
+The only necessary column for result table is the experiment id, other specific column is set by users.
+
+### Detail Table
+Each Detail Table is identified by its corresponding method name, different methods are related to different detail table. During an experiment, you may need to record some intermediate results, which can be saved in this table.
+
+The only necessary columns for a detail table are the detail id (which can be set automatically) and the experiment id; other specific columns are set by users.
+
+
+# Future Plan
+
+- [ ] Some Scripts For Better Usage 
+- [ ] Experiment Summary Report Generate
+- [ ] Web UI Visualize & Commonly Used Analyze Functions
+
+# Contact
+My email is [email protected]. If you have any question or advice, please contact me. 
diff --git a/pyerm/dbbase.py b/pyerm/dbbase.py
@@ -20,7 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-# Version: 0.1.0
+# Version: 0.1.1
 
 import sqlite3
 
@@ -72,6 +72,7 @@ def __init__(self, db:Database, table_name:str, columns:dict=None) -> None:
         assert table_name in db.table_names, f'Table {table_name} does not exist'
         self.db = db
         self.table_name = table_name
+        self._column = None
         if not table_exists(db, table_name):
             assert columns, 'Columns must be provided when creating a new table'
             columns_str = ', '.join([f'{key} {value}' for key, value in columns.items()])
@@ -80,13 +81,14 @@ def __init__(self, db:Database, table_name:str, columns:dict=None) -> None:
             self.db.table_names.append(table_name)
             print(f'Table {table_name} created')
         else:
+            assert columns.keys() == set([column[1] for column in self.db.cursor.execute(f'PRAGMA table_info({table_name})').fetchall()]), f'Columns do not match for table {table_name}, consider to check or change table name'
             print(f'Table {table_name} already exists')
         self.primary_key = [column[5] for column in self.db.cursor.execute(f'PRAGMA table_info({table_name})').fetchall() if column[5] == 1]
 
-    def insert(self, **kwargs) -> None:
+    def insert(self, **kwargs) -> int:
         assert len(kwargs) == len(self.columns) - 1 or len(kwargs) == len(self.columns), 'Parameter number does not match'
         columns = ', '.join(kwargs.keys())
-        values = ', '.join([f'"{value}"' if isinstance(value, str) else str(value) for value in kwargs.values()])
+        values = ', '.join([f'"{value}"' if isinstance(value, str) else str(value) if not isinstance(value, bool) else str(int(value)) for value in kwargs.values()])
         values.replace('None', 'NULL')
         self.db.cursor.execute(f'INSERT INTO {self.table_name} ({columns}) VALUES ({values})')
         self.db.conn.commit()
@@ -109,7 +111,9 @@ def select(self, *columns:str, where:str=None) -> list:
 
     @property
     def columns(self):
-        return [column[1] for column in self.db.cursor.execute(f'PRAGMA table_info({self.table_name})').fetchall()]
+        if self._column is None:
+            self._column = [column[1] for column in self.db.cursor.execute(f'PRAGMA table_info({self.table_name})').fetchall()]
+        return self._column
 
     def __len__(self):
         return len(self.select())
@@ -167,7 +171,7 @@ def __del__(self):
         self.db.conn.commit()
 
     def __str__(self) -> str:
-        return str([column for column in self.get_view_columns()]) + '\n' + \
+        return str([column for column in self.columns]) + '\n' + \
                 str(self.select())
 
 def table_exists(db:Database, table_name:str):

diff --git a/pyerm/experiment.py b/pyerm/experiment.py
@@ -20,14 +20,14 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-# Version: 0.1.0
+# Version: 0.1.1
 
 import os
 import typing
 from PIL import Image
 
 from .dbbase import Database
-from .tables import ExperimentTable, ParameterTable, ResultTable, DetailTable, DataTable
+from .tables import ExperimentTable, MethodTable, ResultTable, DetailTable, DataTable
 
 __all__ = ['Experiment']
 
@@ -44,49 +44,78 @@ def __init__(self, db_path:str=None):
         self.detail_table = None
         self.data_table = None
 
-        self.id = None
-        self.data = None
-        self.data_id = None
-        self.method = None
-        self.method_id = None
-        self.task = None
-
-    def experiment_start(self, description:str, start_time:float=None, tags:str=None) -> int:
-        assert self.data is not None, 'Data not initialized, run data_init() first'
-        assert self.method is not None, 'Method not initialized, run method_init() first'
-        assert self.task is not None, 'Task not initialized, run task_init() first'
-        self.id = self.experiment_table.experiment_start(description, self.method, self.method_id, self.data, self.data_id, self.task, start_time, tags)
+        self._id = None
+        self._data = None
+        self._data_id = None
+        self._method = None
+        self._method_id = None
+        self._task = None
+
+    def experiment_start(self, description:str=None, start_time:float=None, tags:str=None, experimenters:str=None) -> int:
+        """Register a new experiment and return its id.
+
+        Must be called after ``data_init()``, ``method_init()`` and
+        ``task_init()``; the names/ids they stored are forwarded to
+        ``self.experiment_table.experiment_start``.
+
+        Args:
+            description: Optional free-text description of the experiment.
+            start_time: Optional start timestamp, forwarded unchanged to the
+                experiment table.
+            tags: Optional tag string, forwarded unchanged.
+            experimenters: Optional experimenter string, forwarded unchanged.
+
+        Returns:
+            The id reported by the experiment table; it is also kept in
+            ``self._id`` for the later ``experiment_over()`` /
+            ``experiment_failed()`` / ``detail_update()`` calls.
+
+        Raises:
+            AssertionError: If data, method or task has not been initialized.
+        """
+        assert self._data is not None, 'Data not initialized, run data_init() first'
+        assert self._method is not None, 'Method not initialized, run method_init() first'
+        assert self._task is not None, 'Task not initialized, run task_init() first'
+        self._id = self.experiment_table.experiment_start(description, self._method, self._method_id, self._data, self._data_id, self._task, start_time, tags, experimenters)
+        # The signature promises an int, so hand the new id back to the
+        # caller instead of implicitly returning None.
+        return self._id
 
-    def experiment_over(self, rst_dict:typing.Dict[str, typing.Any], images:typing.List[typing.Union[Image.Image, str]], end_time:float=None, useful_time_cost:float=None) -> None:
-        assert self.id is not None, 'Experiment not started, run experiment_start() first'
-        assert len(rst_dict) == len(self.rst_table.columns) - self.rst_table.max_image_num - 3, 'Result definition and result dict length mismatch'
-        self.rst_table.record_rst(experiment_id=self.id, **rst_dict)
-        self.rst_table.record_image(self.id, images)
+    def experiment_over(self, rst_dict:typing.Dict[str, typing.Union[int, float, str, bool, bytearray, bytes]], images:typing.List[typing.Union[Image.Image, str]], end_time:float=None, useful_time_cost:float=None) -> None:
+        assert self._id is not None, 'Experiment not started, run experiment_start() first'
+        assert len(rst_dict) == len(self.rst_table.columns) - self.rst_table.max_image_num - 1, 'Result definition and result dict length mismatch'
+        if self.rst_table is None:
+            rst_def_dict = auto_detect_def(rst_dict)
+            self.rst_table = ResultTable(self.db, self._task, rst_def_dict)
+        self.rst_table.record_rst(experiment_id=self._id, **rst_dict)
+        self.rst_table.record_image(self._id, images)
         self.experiment_table.experiment_over(self.id, end_time=end_time, useful_time_cost=useful_time_cost)
 
     def experiment_failed(self, end_time:float=None) -> None:
-        assert self.id is not None, 'Experiment not started, run experiment_start() first'
+        assert self._id is not None, 'Experiment not started, run experiment_start() first'
         self.experiment_table.experiment_failed(self.id, end_time=end_time)
 
-    def detail_update(self, detail_dict:typing.Dict[str, typing.Any]):
-        assert self.id is not None, 'Experiment not started, run experiment_start() first'
+    def detail_update(self, detail_dict:typing.Dict[str, typing.Union[int, float, str, bool, bytearray, bytes]]):
+        assert self._id is not None, 'Experiment not started, run experiment_start() first'
         assert len(detail_dict) == len(self.detail_table.columns) - 2, 'Detail definition and detail dict length mismatch'
-        self.detail_table.insert(experiment_id=self.id, **detail_dict)
-
-    def data_init(self, data_name:str, param_def_dict:typing.Dict[str, str], param_dict:typing.Dict[str, typing.Any]):
-        assert len(param_def_dict) == len(param_dict), 'Parameter definition and parameter dict length mismatch'
-        self.data = data_name
+        if self.detail_table is None:
+            detail_def_dict = auto_detect_def(detail_dict)
+            self.detail_table = DetailTable(self.db, self._method, detail_def_dict)
+        self.detail_table.insert(experiment_id=self._id, **detail_dict)
+
+    def data_init(self, data_name:str, param_dict:typing.Dict[str, typing.Union[int, float, str, bool, bytearray, bytes]], param_def_dict:typing.Dict[str, str]=None):
+        """Select the data table for this experiment and store its settings.
+
+        Args:
+            data_name: Name of the data; passed to ``DataTable`` and kept in
+                ``self._data``.
+            param_dict: Column name -> value mapping inserted as one row.
+            param_def_dict: Optional column name -> SQL type definition
+                mapping. When omitted, ``auto_detect_def(param_dict)``
+                supplies the definitions.
+
+        Raises:
+            AssertionError: If ``param_def_dict`` is given but its length
+                differs from ``param_dict``.
+        """
+        definition_given = param_def_dict is not None
+        assert not definition_given or len(param_def_dict) == len(param_dict), 'Parameter definition and parameter dict length mismatch'
+        self._data = data_name
+        column_defs = param_def_dict if definition_given else auto_detect_def(param_dict)
+        self.data_table = DataTable(self.db, data_name, column_defs)
+        # Keep the value returned by insert() as the data setting id.
+        self._data_id = self.data_table.insert(**param_dict)
 
-    def method_init(self, method_name:str, param_def_dict:typing.Dict[str, str], param_dict:typing.Dict[str, typing.Any], detail_def_dict:typing.Dict[str, str]):
-        assert len(param_def_dict) == len(param_dict), 'Parameter definition and parameter dict length mismatch'
-        self.method = method_name
-        self.method_table = ParameterTable(self.db, method_name, param_def_dict)
-        self.method_id = self.method_table.insert(**param_dict)
-        self.detail_table = DetailTable(self.db, method_name, detail_def_dict)
-
-    def task_init(self, task_name:str, rst_def_dict:typing.Dict[str, str], max_image_num:int=10):
-        self.task = task_name
-        self.rst_table = ResultTable(self.db, task_name, rst_def_dict, max_image_num)
-
+    def method_init(self, method_name:str, param_dict:typing.Dict[str, typing.Union[int, float, str, bool, bytearray, bytes]], param_def_dict:typing.Dict[str, str]=None, detail_def_dict:typing.Dict[str, str]=None):
+        """Select the method table for this experiment and store its settings.
+
+        Args:
+            method_name: Name of the method; passed to ``MethodTable`` (and
+                ``DetailTable``) and kept in ``self._method``.
+            param_dict: Column name -> value mapping inserted as one row.
+            param_def_dict: Optional column name -> SQL type definition
+                mapping. When omitted, ``auto_detect_def(param_dict)``
+                supplies the definitions.
+            detail_def_dict: Optional column definitions for the detail
+                table. The detail table is only created here when this is
+                given.
+
+        Raises:
+            AssertionError: If ``param_def_dict`` is given but its length
+                differs from ``param_dict``.
+        """
+        # The optional argument is param_def_dict, so that is what the guard
+        # must test; guarding on param_dict made the default call fail with
+        # len(None) before auto-detection could run.
+        assert param_def_dict is None or len(param_def_dict) == len(param_dict), 'Parameter definition and parameter dict length mismatch'
+        self._method = method_name
+        if param_def_dict is None:
+            param_def_dict = auto_detect_def(param_dict)
+        self.method_table = MethodTable(self.db, method_name, param_def_dict)
+        # Keep the value returned by insert() as the method setting id.
+        self._method_id = self.method_table.insert(**param_dict)
+        if detail_def_dict is not None:
+            self.detail_table = DetailTable(self.db, method_name, detail_def_dict)
+
+    def task_init(self, task_name:str, rst_def_dict:typing.Dict[str, str]=None, max_image_num:int=10):
+        """Record the task name and, if definitions are given, open its result table.
+
+        Args:
+            task_name: Name of the task; kept in ``self._task``.
+            rst_def_dict: Optional column name -> SQL type definition mapping
+                for the result table. Without it no result table is created
+                here.
+            max_image_num: Forwarded to ``ResultTable`` when the table is
+                created here.
+        """
+        self._task = task_name
+        if rst_def_dict is None:
+            # Nothing to create yet; only the task name is recorded.
+            return
+        self.rst_table = ResultTable(self.db, task_name, rst_def_dict, max_image_num)
+
+
+def auto_detect_def(param_dict:typing.Dict[str, typing.Union[int, float, str, bool, bytearray, bytes]]) -> typing.Dict[str, str]:
+    """Infer an SQL column definition for every entry of ``param_dict``.
+
+    Args:
+        param_dict: Column name -> value mapping; the Python type of each
+            value selects the column definition.
+
+    Returns:
+        A dict with the same keys whose values are ``'INTEGER'``, ``'REAL'``,
+        ``'TEXT'``, ``'BLOB'`` or, for bools, an ``INTEGER`` column with a
+        ``CHECK(<name> IN (0, 1))`` constraint.
+
+    Raises:
+        TypeError: If a value is of any other type (including ``None``).
+    """
+    param_def_dict = {}
+    for k, v in param_dict.items():
+        # bool is a subclass of int, so it must be tested before int;
+        # otherwise the CHECK-constrained branch can never be reached.
+        if isinstance(v, bool):
+            param_def_dict[k] = f'INTEGER CHECK({k} IN (0, 1))'
+        elif isinstance(v, int):
+            param_def_dict[k] = 'INTEGER'
+        elif isinstance(v, float):
+            param_def_dict[k] = 'REAL'
+        elif isinstance(v, str):
+            param_def_dict[k] = 'TEXT'
+        elif isinstance(v, (bytes, bytearray)):
+            param_def_dict[k] = 'BLOB'
+        else:
+            raise TypeError(f'Unsupported type for DB: {type(v)}')
+    return param_def_dict
diff --git a/pyerm/tables.py b/pyerm/tables.py
@@ -20,38 +20,41 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-# Version: 0.1.0
+# Version: 0.1.1
 
 from PIL import Image
 from io import BytesIO
 from time import strftime, time
 import typing
+import traceback
 
 from .dbbase import Table, Database
 
 class ExperimentTable(Table):
     def __init__(self, db: Database) -> None:
         columns = {
             'id': 'INTEGER PRIMARY KEY AUTOINCREMENT',
-            'description': 'TEXT',
+            'description': 'TEXT DEFAULT NULL',
             'method': 'TEXT NOT NULL',
             'method_id': 'INTEGER NOT NULL', 
             'data': 'TEXT NOT NULL',
             'data_id': 'INTEGER NOT NULL',
             'task': 'TEXT NOT NULL',
             'tags': 'TEXT DEFAULT NULL', # 'tag1,tag2,tag3,...'
+            'experimenters': 'TEXT DEFAULT NULL', # 'experimenter1,experimenter2,experimenter3,...'
             'start_time': 'DATETIME',
             'end_time': 'DATETIME DEFAULT NULL',
             'useful_time_cost': 'REAL DEFAULT NULL',
             'total_time_cost': 'REAL AS (strftime(\"%s\", end_time) - strftime(\"%s\", start_time)) VIRTUAL',
             'status': 'TEXT CHECK(status IN (\"running\", \"finished\", \"failed\"))',
+            'failed_reason': 'TEXT DEFAULT NULL',
         }
         super().__init__(db, "experiment_list", columns)
 
-    def experiment_start(self, description:str, method:str, method_id:int, data:str, data_id, task:str, start_time:float=None, tags:str=None) -> int:
+    def experiment_start(self, description:str, method:str, method_id:int, data:str, data_id, task:str, start_time:float=None, tags:str=None, experimenters:str=None) -> int:
         if start_time is None:
             start_time = time()
-        return super().insert(description=description, method=method, method_id=method_id, data=data, data_id=data_id, task=task, tags=tags, start_time=strftime(start_time), status='running')
+        return super().insert(description=description, method=method, method_id=method_id, data=data, data_id=data_id, task=task, tags=tags, experimenters=experimenters, start_time=strftime(start_time), status='running')
 
     def experiment_over(self, experiment_id:int, end_time:float=None, useful_time_cost:float=None) -> None:
         if end_time is None:
@@ -61,7 +64,8 @@ def experiment_over(self, experiment_id:int, end_time:float=None, useful_time_co
     def experiment_failed(self, experiment_id:int, end_time:float=None) -> None:
         if end_time is None:
             end_time = time()
-        super().update(f"id={experiment_id}", end_time=strftime(end_time), status='failed')
+        error_info = traceback.format_exc()
+        super().update(f"id={experiment_id}", end_time=strftime(end_time), status='failed', failed_reason=error_info)
 
     def get_experiment(self, experiment_id:int) -> dict:
         return super().select(where=f"id={experiment_id}")[0]
@@ -82,13 +86,13 @@ def __init__(self, db: Database, data: str, param_def_dict: dict=None) -> None:
             }
         super().__init__(db, table_name, columns)
 
-class ParameterTable(Table):
+class MethodTable(Table):
     def __init__(self, db: Database, method: str, param_def_dict: dict=None) -> None:
-        table_name = f"parameter_\'{method}\'"
+        table_name = f"method_\'{method}\'"
         if table_name in self.db.table_names:
             columns = None
         else:
-            assert param_def_dict, 'Parameter Dict must be provided when creating a new parameter table'
+            assert param_def_dict, 'Method Parameter Dict must be provided when creating a new parameter table'
             columns = {
                 'method_id': 'INTEGER PRIMARY KEY',
                 **param_def_dict,
@@ -99,8 +103,6 @@ class ResultTable(Table):
     def __init__(self, db: Database, task: str, rst_def_dict: dict=None, max_image_num: int=10) -> None:
         columns = {
             'id': 'INTEGER PRIMARY KEY',
-            'method': 'TEXT',
-            'data': 'TEXT',
             **{f'image_{i}': 'BLBO DEFAULT NULL' for i in range(max_image_num)},
             **rst_def_dict,
         }

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='pyerm',
-    version='0.1.0',
+    version='0.1.1',
     packages=find_packages(),
     include_package_data=True,
 )