From 25ea334ec77d0539d43a8fe08f896b001df60379 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Tue, 3 Sep 2024 20:13:59 +0800 Subject: [PATCH 01/30] init cli --- cli/.clang-format | 33 ++++++ cli/.gitignore | 144 ++++++++++++++++++++++++++ cli/CMakeLists.txt | 28 +++++ cli/README.md | 33 ++++++ cli/conda.recipe/meta.yaml | 29 ++++++ cli/import.json | 124 +++++++++++++++++++++++ cli/pyproject.toml | 56 ++++++++++ cli/src/graphar_cli/__init__.py | 5 + cli/src/graphar_cli/config.py | 57 +++++++++++ cli/src/graphar_cli/graphar_cli.py | 157 +++++++++++++++++++++++++++++ cli/src/graphar_cli/importer.py | 76 ++++++++++++++ cli/src/graphar_cli/logging.py | 23 +++++ cli/src/main.cc | 88 ++++++++++++++++ 13 files changed, 853 insertions(+) create mode 100644 cli/.clang-format create mode 100644 cli/.gitignore create mode 100644 cli/CMakeLists.txt create mode 100644 cli/README.md create mode 100644 cli/conda.recipe/meta.yaml create mode 100644 cli/import.json create mode 100644 cli/pyproject.toml create mode 100644 cli/src/graphar_cli/__init__.py create mode 100644 cli/src/graphar_cli/config.py create mode 100644 cli/src/graphar_cli/graphar_cli.py create mode 100644 cli/src/graphar_cli/importer.py create mode 100644 cli/src/graphar_cli/logging.py create mode 100644 cli/src/main.cc diff --git a/cli/.clang-format b/cli/.clang-format new file mode 100644 index 000000000..233429c9f --- /dev/null +++ b/cli/.clang-format @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +BasedOnStyle: Google +DerivePointerAlignment: false +PointerAlignment: Left +Cpp11BracedListStyle: true +IndentCaseLabels: false +AllowShortBlocksOnASingleLine: true +AllowShortLoopsOnASingleLine: false +AllowShortIfStatementsOnASingleLine: false +Standard: 'Cpp11' +SpaceAfterCStyleCast: true +AlignAfterOpenBracket: Align +SortIncludes: true +IncludeBlocks: Preserve +ForEachMacros: + - BOOST_FOREACH + diff --git a/cli/.gitignore b/cli/.gitignore new file mode 100644 index 000000000..ecba8a4bf --- /dev/null +++ b/cli/.gitignore @@ -0,0 +1,144 @@ +# Using https://github.com/github/gitignore/blob/master/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+docs/_generate/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+_skbuild/
+.pyodide-xbuildenv/
diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt
new file mode 100644
index 000000000..35eeec6e0
--- /dev/null
+++ b/cli/CMakeLists.txt
@@ -0,0 +1,28 @@
+# Require CMake 3.15+ (matching scikit-build-core). Use new versions of all
+# policies up to CMake 3.27.
+cmake_minimum_required(VERSION 3.15...3.27)
+
+# Scikit-build-core sets these values for you, or you can just hard-code the
+# name and version.
+project(
+  ${SKBUILD_PROJECT_NAME}
+  VERSION ${SKBUILD_PROJECT_VERSION}
+  LANGUAGES CXX)
+
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../cpp ${CMAKE_BINARY_DIR}/graphar)
+
+# Find the module development requirements (requires FindPython from 3.17 or
+# scikit-build-core's built-in backport)
+find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
+find_package(pybind11 CONFIG REQUIRED)
+
+# Add a library using FindPython's tooling (pybind11 also provides a helper like
+# this)
+python_add_library(_core MODULE src/main.cc WITH_SOABI)
+target_link_libraries(_core PRIVATE pybind11::headers graphar)
+
+# This is passing in the version as a define just as an example
+target_compile_definitions(_core PRIVATE VERSION_INFO=${PROJECT_VERSION})
+
+# The install directory is the output (wheel) directory
+install(TARGETS _core DESTINATION graphar_cli)
diff --git a/cli/README.md b/cli/README.md
new file mode 100644
index 000000000..eb4eda2cf
--- /dev/null
+++ b/cli/README.md
@@ -0,0 +1,50 @@
+# GraphAr Cli (Under Development)
+
+GraphAr Cli uses [pybind11][] and [scikit-build-core][] to bind the GraphAr C++ library into Python and package it as a Python extension. The command-line tool itself is developed with [typer][].
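+
+For example, the Python layer simply imports the compiled `_core` extension and wraps its functions in typer commands. The sketch below mirrors the pattern used in `cli/src/graphar_cli/graphar_cli.py` (`show_graph` is the binding defined in `cli/src/main.cc`; the standalone app here is illustrative and assumes the package is installed):
+
+```python
+import typer
+from graphar_cli._core import show_graph  # C++ function exposed via pybind11
+
+app = typer.Typer()
+
+@app.command()
+def show(path: str) -> None:
+    """Print the metadata of the graph described by the GraphAr YAML at `path`."""
+    typer.echo(show_graph(path))
+
+if __name__ == "__main__":
+    app()
+```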
+
+[pybind11]: https://pybind11.readthedocs.io
+[scikit-build-core]: https://scikit-build-core.readthedocs.io
+[typer]: https://typer.tiangolo.com/
+
+## Requirements
+
+- Ubuntu 22.04
+- CMake >= 3.15
+- Arrow >= 12.0
+- Python >= 3.7
+- pip (latest)
+
+
+The best-supported test environment is the `ghcr.io/apache/graphar-dev` Docker image.
+
+Using Python from a conda environment or a venv is also a good choice.
+
+## Installation
+
+- Clone this repository
+- `pip install ./cli`, or `pip install -v ./cli` for verbose output
+
+## Usage
+
+```bash
+graphar-cli --help
+```
+
+
diff --git a/cli/conda.recipe/meta.yaml b/cli/conda.recipe/meta.yaml
new file mode 100644
index 000000000..842a2bf72
--- /dev/null
+++ b/cli/conda.recipe/meta.yaml
@@ -0,0 +1,29 @@
+package:
+  name: graphar_cli
+  version: 0.0.1
+
+source:
+  path: ..
+
+build:
+  number: 0
+  script: {{ PYTHON }} -m pip install . -vv
+
+requirements:
+  build:
+    - python
+    - {{ compiler('cxx') }}
+
+  host:
+    - python
+    - pip
+    - scikit-build-core
+    - pybind11 >=2.10.0
+
+  run:
+    - python
+
+
+about:
+  summary: GraphAr Command Line
+  license_file: LICENSE
diff --git a/cli/import.json b/cli/import.json
new file mode 100644
index 000000000..66fbb0a0a
--- /dev/null
+++ b/cli/import.json
@@ -0,0 +1,124 @@
+{
+    "graphar": {
+        "path": "/tmp/graphar/movie",
+        "name": "MovieGraph",
+        "vertexChunkSize": 100,
+        "edgeChunkSize": 1024,
+        "fileType": "parquet",
+        "adjListType": "ordered_by_source"
+    },
+    "import_schema": {
+        "vertices": [
+            {
+                "type": "Person",
+                "labels": [
+                    "Person"
+                ],
+                "primary": "name",
+                "properties": [
+                    {
+                        "name": "name",
+                        "type": "string"
+                    },
+                    {
+                        "name": "born",
+                        "type": "int64",
+                        "nullable": true
+                    }
+                ],
+                "propertyGroups": [
+                    [
+                        "name",
+                        "born"
+                    ]
+                ],
+                "source": {
+                    "type": "parquet",
+                    "path": "/tmp/graphar/movie/Person.parquet",
+                    "columns": {
+                        "name": "name",
+                        "born": "born"
+                    }
+                }
+            },
+            {
+                "type": "Movie",
+                "labels": [
+                    "Movie"
+                ],
+                "primary": "title",
+                "properties": [
+                    {
+                        "name": "title",
+                        "type": "string"
+                    },
+                    {
+                        "name": "tagline",
+                        "type": "string",
+                        "nullable": true
+                    }
+                ],
+                "propertyGroups": [
+                    [
+                        "title",
+                        "tagline"
+                    ]
+                ],
+                "source": {
+                    "type": "csv",
+                    "path": "/tmp/graphar/movie/Movie.csv",
+                    "delimiter": ",",
+                    "columns": {
+                        "title": "title",
+                        "tagline": "tagline"
+                    }
+                }
+            }
+        ],
+        "edges": [
+            {
+                "type": "PRODUCED",
+                "adjListType": "ordered_by_dest",
+                "srcType": "Person",
+                "srcProp": "name",
+                "dstType": "Movie",
+                "dstProp": "title",
+                "source": {
+                    "type": "csv",
+                    "path": "/tmp/graphar/movie/Produced.csv",
+                    "columns": {
+                        "srcIndex": "_graphArSrcIndex",
+                        "dstIndex": "_graphArDstIndex"
+                    }
+                }
+            },
+            {
+                "type": "REVIEWED",
+                "srcType": "Person",
+                "srcProp": "name",
+                "dstType": "Movie",
+                "dstProp": "title",
+                "properties": [
+                    {
+                        "name": "rating",
+                        "type": "string"
+                    },
+                    {
+                        "name": "summary",
+                        "type": "string"
+                    }
+                ],
+                "source": {
+                    "type": "csv",
+                    "path": "/tmp/graphar/movie/Produced.csv",
+                    "columns": {
+                        "srcIndex": "_graphArSrcIndex",
+                        "dstIndex": "_graphArDstIndex",
+                        "rating": "rating",
+                        "summary": "summary"
+                    }
+                }
+            }
+        ]
+    }
+}
\ No newline at end of file
diff --git a/cli/pyproject.toml b/cli/pyproject.toml
new file mode 100644
index 000000000..b1c6a3e0f
--- /dev/null
+++ b/cli/pyproject.toml
@@ -0,0 +1,56 @@
+[build-system]
+requires = ["scikit-build-core>=0.3.3", "pybind11"]
+build-backend = "scikit_build_core.build"
+
+
+[project]
+name = "graphar_cli"
+version = "0.0.1"
+description = "GraphAr command line tool"
+readme = "README.md"
+authors
= [{ name = "GraphAr community", email = "dev@graphar.apache.org" }] +requires-python = ">=3.7" +dependencies = ["typer ~= 0.12.3", "pandas ~= 2.2.2", "pydantic ~= 2.8.2"] + + +[project.scripts] +graphar-cli = "graphar_cli.graphar_cli:main" + + +[tool.scikit-build] +build-dir = "build" + + +[tool.ruff] +src = ["src"] +line-length = 100 + + +[tool.ruff.lint] +extend-select = [ + "B", # flake8-bugbear + "I", # isort + "ARG", # flake8-unused-arguments + "C4", # flake8-comprehensions + "EM", # flake8-errmsg + "ICN", # flake8-import-conventions + "G", # flake8-logging-format + "PGH", # pygrep-hooks + "PIE", # flake8-pie + "PL", # pylint + "PT", # flake8-pytest-style + "PTH", # flake8-use-pathlib + "RET", # flake8-return + "RUF", # Ruff-specific + "SIM", # flake8-simplify + "T20", # flake8-print + "UP", # pyupgrade + "YTT", # flake8-2020 + "EXE", # flake8-executable + "NPY", # NumPy specific rules + "PD", # pandas-vet +] +ignore = [ + "PLR09", # Too many X + "PLR2004", # Magic comparison +] diff --git a/cli/src/graphar_cli/__init__.py b/cli/src/graphar_cli/__init__.py new file mode 100644 index 000000000..4e7badc5c --- /dev/null +++ b/cli/src/graphar_cli/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from ._core import __doc__, __version__ + +__all__ = ["__doc__", "__version__"] diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py new file mode 100644 index 000000000..5f74c7371 --- /dev/null +++ b/cli/src/graphar_cli/config.py @@ -0,0 +1,57 @@ +from typing import Dict, List, Literal, Optional + +from pydantic import BaseModel + + +class GraphArConfig(BaseModel): + path: str + name: str + vertexChunkSize: int = 100 + edgeChunkSize: int = 1024 + fileType: Literal["parquet", "orc", "csv", "json"] = "parquet" + adjListType: Literal[ + "ordered_by_source", "ordered_by_dest", "unordered_by_source", "unordered_by_dest" + ] = "ordered_by_source" + + +class Property(BaseModel): + name: str + type: Literal["bool", "int32", "int64", "float", "double", "string", "date", "timestamp"] + nullable: bool = False + + +class Source(BaseModel): + type: Literal["parquet", "orc", "csv", "json"] = "csv" + path: str + delimiter: str = "," + columns: Dict[str, str] + + +class Vertex(BaseModel): + type: str + labels: List[str] + primary: str + properties: List[Property] + propertyGroups: List[List[str]] = [] + source: Source + + +class Edge(BaseModel): + type: str + srcType: str + srcProp: str + dstType: str + dstProp: str + properties: List[Property] = [] + propertyGroups: List[List[str]] = [] + source: Source + + +class SchemaConfig(BaseModel): + vertices: List[Vertex] + edges: List[Edge] + + +class ImportConfig(BaseModel): + graphar: GraphArConfig + import_schema: SchemaConfig diff --git a/cli/src/graphar_cli/graphar_cli.py b/cli/src/graphar_cli/graphar_cli.py new file mode 100644 index 000000000..72b797eed --- /dev/null +++ b/cli/src/graphar_cli/graphar_cli.py @@ -0,0 +1,157 @@ +import json +from logging import getLogger +from pathlib import Path +from typing import List + +import pandas as pd +import typer + +from ._core import ( + check_edge, + check_graph, + check_vertex, + get_edge_types, + get_vertex_types, + show_edge, + show_graph, + show_vertex, +) +from .config import ImportConfig +from .importer import do_import, validate +from .logging import setup_logging + +app = typer.Typer(help="GraphAr Cli", no_args_is_help=True, add_completion=False) + +setup_logging() +logger = getLogger(__name__) + + +@app.command(context_settings={"help_option_names": ["-h", "--help"]}, 
help="Show the metadata") +def show( + path: str = typer.Option(None, "--path", "-p", help="Path to the GraphAr config file"), + vertex: str = typer.Option(None, "--vertex", "-v", help="Vertex type to show"), + edge_src: str = typer.Option(None, "--edge-src", "-es", help="Source of the edge type to show"), + edge: str = typer.Option(None, "--edge", "-e", help="Edge type to show"), + edge_dst: str = typer.Option( + None, "--edge-dst", "-ed", help="Destination of the edge type to show" + ), +) -> None: + if not Path(path).exists(): + logger.error("File not found: %s", path) + raise typer.Exit(1) + path = Path(path).resolve() if Path(path).is_absolute() else Path(Path.cwd(), path).resolve() + path = str(path) + if vertex: + vertex_types = get_vertex_types(path) + if vertex not in vertex_types: + logger.error("Vertex %s not found in the graph", vertex) + raise typer.Exit(1) + logger.info(show_vertex(path, vertex)) + raise typer.Exit(0) + if edge or edge_src or edge_dst: + if not (edge and edge_src and edge_dst): + logger.error("Edge source, edge, and edge destination must all be set") + raise typer.Exit(1) + edge_types = get_edge_types(path) + found = False + for edge_type in edge_types: + if edge_type[0] == edge_src and edge_type[1] == edge and edge_type[2] == edge_dst: + found = True + break + if not found: + logger.error( + "Edge type with source %s, edge %s, and destination %s not found in the graph", + edge_src, + edge, + edge_dst, + ) + raise typer.Exit(1) + logger.info(show_edge(path, edge)) + raise typer.Exit(1) + logger.info(show_graph(path)) + + +@app.command(context_settings={"help_option_names": ["-h", "--help"]}, help="Check the metadata") +def check( + path: str = typer.Option(None, "--path", "-p", help="Path to the GraphAr config file"), +): + if not Path(path).exists(): + logger.error("File not found: %s", path) + raise typer.Exit(1) + path = Path(path).resolve() if Path(path).is_absolute() else Path(Path.cwd(), path).resolve() + path = str(path) + vertex_types = get_vertex_types(path) + for vertex_type in vertex_types: + if not check_vertex(path, vertex_type): + logger.error("Vertex type %s is not valid", vertex_type) + raise typer.Exit(1) + edge_types = get_edge_types(path) + for edge_type in edge_types: + if edge_type[0] not in vertex_types: + logger.error("Source vertex type %s not found in the graph", edge_type[0]) + raise typer.Exit(1) + if edge_type[2] not in vertex_types: + logger.error("Destination vertex type %s not found in the graph", edge_type[2]) + raise typer.Exit(1) + if not check_edge(path, edge_type[0], edge_type[1], edge_type[2]): + logger.error( + "Edge type %s_%s_%s is not valid", edge_type[0], edge_type[1], edge_type[2] + ) + raise typer.Exit(1) + if not check_graph(path): + logger.error("Graph is not valid") + raise typer.Exit(1) + logger.info("Graph is valid") + + +@app.command("import", context_settings={"help_option_names": ["-h", "--help"]}, help="Import data") +def import_data( + config_file: str = typer.Option(None, "--config", "-c", help="Path to the GraphAr config file"), +): + if not Path(config_file).exists(): + logger.error("File not found: %s", config_file) + raise typer.Exit(1) + with Path(config_file).open(encoding="utf-8") as file: + config = json.load(file) + try: + import_config = ImportConfig(**config) + validate(import_config) + except Exception as e: + logger.error("Invalid config: %s", e) + raise typer.Exit(1) from None + try: + do_import(import_config) + except Exception as e: + logger.error("Import failed: %s", e) + raise 
typer.Exit(1) from None + logger.info(config) + + +@app.command( + "merge", context_settings={"help_option_names": ["-h", "--help"]}, help="Merge source files" +) +def merge_data( + files: List[str] = typer.Option(None, "--file", "-f", help="Files to merge"), # noqa: B008 + type: str = typer.Option(None, "--type", "-t", help="Type of data to merge"), + output: str = typer.Option(None, "--output", "-o", help="Output file"), +): + if type == "parquet": + data = pd.concat([pd.read_parquet(file) for file in files], axis=1) + data.to_parquet(output) + elif type == "csv": + data = pd.concat([pd.read_csv(file) for file in files], axis=1) + data.to_csv(output) + elif type == "orc": + data = pd.concat([pd.read_orc(file) for file in files], axis=1) + data.to_orc(output) + elif type == "json": + data = pd.concat([pd.read_json(file) for file in files], axis=1) + data.to_json(output) + else: + logger.error("Type %s not supported", type) + raise typer.Exit(1) + logger.info("Merged data saved to %s", output) + + +def main() -> None: + app() diff --git a/cli/src/graphar_cli/importer.py b/cli/src/graphar_cli/importer.py new file mode 100644 index 000000000..66f603d05 --- /dev/null +++ b/cli/src/graphar_cli/importer.py @@ -0,0 +1,76 @@ +from .config import ImportConfig + + +def validate(config: ImportConfig): + vertex_types = set() + for vertex in config.import_schema.vertices: + if vertex.type in vertex_types: + msg = f"Duplicate vertex type {vertex.type}" + raise ValueError(msg) + vertex_types.add(vertex.type) + + prop_names = set() + for prop in vertex.properties: + if prop.name in prop_names: + msg = f"Duplicate property {prop.name} in vertex properties" + raise ValueError(msg) + prop_names.add(prop.name) + if prop.name == vertex.primary and prop.nullable: + msg = f"Primary key {vertex.primary} cannot be nullable" + raise ValueError(msg) + if vertex.primary not in prop_names: + msg = f"Primary key {vertex.primary} not found in vertex properties" + raise ValueError(msg) + prop_names_in_groups = set() + for prop_group in vertex.propertyGroups: + for prop_name in prop_group: + if prop_name not in prop_names: + msg = f"Property {prop} not found in vertex properties" + raise ValueError(msg) + if prop_name in prop_names_in_groups: + msg = f"Property {prop} found in multiple property groups" + raise ValueError(msg) + prop_names_in_groups.add(prop_name) + for prop_name in prop_names: + if prop_name not in vertex.source.columns: + msg = f"Property {prop_name} not found in source columns" + raise ValueError(msg) + + edge_types = set() + for edge in config.import_schema.edges: + if edge.type in edge_types: + msg = f"Duplicate edge type {edge.type}" + raise ValueError(msg) + edge_types.add(edge.type) + + if edge.srcType not in vertex_types: + msg = f"Source vertex type {edge.srcType} not found" + raise ValueError(msg) + if edge.dstType not in vertex_types: + msg = f"Destination vertex type {edge.dstType} not found" + raise ValueError(msg) + # TODO: Check that source and destination properties are present in the source vertex + prop_names = set() + for prop in edge.properties: + if prop.name in prop_names: + msg = f"Duplicate property {prop.name} in edge properties" + raise ValueError(msg) + prop_names.add(prop.name) + prop_names_in_groups = set() + for prop_group in edge.propertyGroups: + for prop_name in prop_group: + if prop_name not in prop_names: + msg = f"Property {prop} not found in edge properties" + raise ValueError(msg) + if prop_name in prop_names_in_groups: + msg = f"Property {prop} found in multiple 
property groups" + raise ValueError(msg) + prop_names_in_groups.add(prop_name) + for prop_name in prop_names: + if prop_name not in edge.source.columns: + msg = f"Property {prop_name} not found in source columns" + raise ValueError(msg) + + +def do_import(config: ImportConfig): + pass \ No newline at end of file diff --git a/cli/src/graphar_cli/logging.py b/cli/src/graphar_cli/logging.py new file mode 100644 index 000000000..3a2b90684 --- /dev/null +++ b/cli/src/graphar_cli/logging.py @@ -0,0 +1,23 @@ +import logging +from typing import Union + +from rich.console import Console +from rich.logging import RichHandler + + +def setup_logging(terminal_width: Union[int, None] = None) -> None: + logger = logging.getLogger("graphar_cli") + console = Console(width=terminal_width) if terminal_width else None + rich_handler = RichHandler( + show_time=False, + rich_tracebacks=True, + tracebacks_show_locals=True, + markup=True, + show_path=False, + console=console, + ) + rich_handler.setFormatter(logging.Formatter("%(message)s")) + logger.addHandler(rich_handler) + + logger.setLevel(logging.INFO) + logger.propagate = False diff --git a/cli/src/main.cc b/cli/src/main.cc new file mode 100644 index 000000000..3387dca46 --- /dev/null +++ b/cli/src/main.cc @@ -0,0 +1,88 @@ +#include +#include + +#include + +#define STRINGIFY(x) #x +#define MACRO_STRINGIFY(x) STRINGIFY(x) + +std::string ShowGraph(const std::string& path) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + return graph_info->Dump().value(); +} + +std::string ShowVertex(const std::string& path, + const std::string& vertex_type) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + auto vertex_info = graph_info->GetVertexInfo(vertex_type); + return vertex_info->Dump().value(); +} + +std::string ShowEdge(const std::string& path, const std::string& src_type, + const std::string& edge_type, + const std::string& dst_type) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + auto edge_info = graph_info->GetEdgeInfo(src_type, edge_type, dst_type); + return edge_info->Dump().value(); +} + +bool CheckGraph(const std::string& path) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + return graph_info->IsValidated(); +} + +bool CheckVertex(const std::string& path, const std::string& vertex_type) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + auto vertex_info = graph_info->GetVertexInfo(vertex_type); + return vertex_info->IsValidated(); +} + +bool CheckEdge(const std::string& path, const std::string& src_type, + const std::string& edge_type, const std::string& dst_type) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + auto edge_info = graph_info->GetEdgeInfo(src_type, edge_type, dst_type); + return edge_info->IsValidated(); +} + +std::vector GetVertexTypes(const std::string& path) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + auto vertex_infos = graph_info->GetVertexInfos(); + std::vector vertex_types; + for (const auto& vertex_info : vertex_infos) { + vertex_types.push_back(vertex_info->GetType()); + } + return vertex_types; +} + +std::vector> GetEdgeTypes(const std::string& path) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + auto edge_infos = graph_info->GetEdgeInfos(); + std::vector> edge_types; + for (const auto& edge_info : edge_infos) { + std::vector edge_type; + edge_type.push_back(edge_info->GetEdgeType()); + edge_type.push_back(edge_info->GetSrcType()); + edge_type.push_back(edge_info->GetDstType()); + edge_types.push_back(edge_type); 
+ } + return edge_types; +} + +namespace py = pybind11; +PYBIND11_MODULE(_core, m) { + m.doc() = "GraphAr Python bindings"; + m.def("show_graph", &ShowGraph, "Show the graph info"); + m.def("show_vertex", &ShowVertex, "Show the vertex info"); + m.def("show_edge", &ShowEdge, "Show the edge info"); + m.def("check_graph", &CheckGraph, "Check the graph info"); + m.def("check_vertex", &CheckVertex, "Check the vertex info"); + m.def("check_edge", &CheckEdge, "Check the edge info"); + m.def("get_vertex_types", &GetVertexTypes, "Get the vertex types"); + m.def("get_edge_types", &GetEdgeTypes, "Get the edge types"); + +#ifdef VERSION_INFO + m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); +#else + m.attr("__version__") = "dev"; +#endif +} From a8da3cecbd36f15a981a88983c5b606c22ff560b Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Thu, 5 Sep 2024 15:13:21 +0800 Subject: [PATCH 02/30] fix include --- cli/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index 35eeec6e0..755558819 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -20,6 +20,7 @@ find_package(pybind11 CONFIG REQUIRED) # this) python_add_library(_core MODULE src/main.cc WITH_SOABI) target_link_libraries(_core PRIVATE pybind11::headers graphar) +target_link_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/src) # This is passing in the version as a define just as an example target_compile_definitions(_core PRIVATE VERSION_INFO=${PROJECT_VERSION}) From 54ad9244380081152774b3c1938c1407340a41d2 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Fri, 18 Oct 2024 19:03:49 +0800 Subject: [PATCH 03/30] add ci --- .github/workflows/cli.yml | 104 ++++++++++++++++++++++++++++++++++++++ cli/CMakeLists.txt | 6 ++- cli/pyproject.toml | 1 - 3 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/cli.yml diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml new file mode 100644 index 000000000..0de3eeb44 --- /dev/null +++ b/.github/workflows/cli.yml @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
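+# Smoke-tests the CLI build: installs the package with pip on Ubuntu and
+# macOS and checks that the `graphar-cli` entry point runs.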
+ +name: GraphAr CLI CI + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths: + - 'cpp/**' + - 'cli/**' + - '.github/workflows/ci.yml' + - '.github/workflows/cli.yml' + pull_request: + branches: + - main + paths: + - 'cpp/**' + - 'cli/**' + - '.github/workflows/ci.yml' + - '.github/workflows/cli.yml' +concurrency: + group: ${{ github.repository }}-${{ github.event.number || github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + ubuntu: + name: Ubuntu 22.04 CLI + runs-on: ubuntu-latest + if: ${{ !contains(github.event.pull_request.title, 'WIP') && !github.event.pull_request.draft }} + steps: + - uses: actions/checkout@v3 + with: + submodules: true + + - name: Install dependencies + run: | + + # install the latest arrow deb to test arrow + wget -c https://apache.jfrog.io/artifactory/arrow/"$(lsb_release --id --short | tr 'A-Z' 'a-z')"/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \ + -P /tmp/ + sudo apt-get install -y /tmp/apache-arrow-apt-source-latest-"$(lsb_release --codename --short)".deb + sudo apt-get update -y + sudo apt install -y libarrow-dev \ + libarrow-dataset-dev \ + libarrow-acero-dev \ + libparquet-dev + sudo apt-get install -y ccache libcurl4-openssl-dev + + - name: Install GraphAr CLI + working-directory: "cli" + run: | + pip install ./ -vvv + graphar-cli --help + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + + macos: + name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} CLI + runs-on: macos-${{ matrix.macos-version }} + if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.event.pull_request.draft == false }} + strategy: + fail-fast: false + matrix: + include: + - architecture: AMD64 + macos-version: "12" + - architecture: ARM64 + macos-version: "14" + steps: + - uses: actions/checkout@v3 + with: + submodules: true + + - name: Install dependencies + run: | + brew bundle --file=cpp/Brewfile + + + - name: Build GraphAr + working-directory: "cli" + run: | + pip install ./ -vvv + graphar-cli --help diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index 755558819..8f01ae809 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -20,10 +20,14 @@ find_package(pybind11 CONFIG REQUIRED) # this) python_add_library(_core MODULE src/main.cc WITH_SOABI) target_link_libraries(_core PRIVATE pybind11::headers graphar) -target_link_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/src) +target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) +target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/src) +target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/thirdparty) # This is passing in the version as a define just as an example target_compile_definitions(_core PRIVATE VERSION_INFO=${PROJECT_VERSION}) # The install directory is the output (wheel) directory +set_target_properties(_core PROPERTIES INSTALL_RPATH "$ORIGIN") +install(TARGETS graphar DESTINATION graphar_cli) install(TARGETS _core DESTINATION graphar_cli) diff --git a/cli/pyproject.toml b/cli/pyproject.toml index b1c6a3e0f..ec59f00c7 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -20,7 +20,6 @@ graphar-cli = "graphar_cli.graphar_cli:main" [tool.scikit-build] build-dir = "build" - [tool.ruff] src = ["src"] line-length = 100 From 2b07a69454f18abfce360308be54f0e2c06e11cc Mon Sep 17 00:00:00 2001 
From: jasinliu <939282975@qq.com> Date: Sun, 20 Oct 2024 03:37:02 +0800 Subject: [PATCH 04/30] add vertex info --- cli/import.full.yml | 139 ++++++++++++++++++ cli/pyproject.toml | 7 +- cli/src/graphar_cli/config.py | 204 +++++++++++++++++++++++---- cli/src/graphar_cli/graphar_cli.py | 80 ++++++++--- cli/src/importer.h | 217 +++++++++++++++++++++++++++++ cli/src/main.cc | 67 ++++++++- 6 files changed, 666 insertions(+), 48 deletions(-) create mode 100644 cli/import.full.yml create mode 100644 cli/src/importer.h diff --git a/cli/import.full.yml b/cli/import.full.yml new file mode 100644 index 000000000..cacdeed6d --- /dev/null +++ b/cli/import.full.yml @@ -0,0 +1,139 @@ +graphar: + path: /workspaces/incubator-graphar/graphar/movie + name: MovieGraph + vertex_chunk_size: 100 + edge_chunk_size: 1024 + file_type: parquet + adj_list_type: ordered_by_source + source_file_type: parquet + version: gar/v1 + +import_schema: + vertices: + - type: Person + labels: + - Person + chunk_size: 100 + prefix: vertex/Person/ + property_groups: + - file_type: parquet + properties: + - name: name + data_type: string + is_primary: true + nullable: false + - name: born + data_type: int64 + is_primary: false + nullable: true + sources: + - file_type: parquet + path: /workspaces/incubator-graphar/testing/neo4j/data/Person.parquet + columns: + name: name + born: born + - type: Movie + labels: + - Movie + chunk_size: 100 + prefix: vertex/Movie/ + property_groups: + - file_type: parquet + properties: + - name: title + data_type: string + is_primary: true + nullable: false + - name: tagline + data_type: string + is_primary: false + nullable: true + sources: + - file_type: parquet + path: /workspaces/incubator-graphar/testing/neo4j/data/Movie.parquet + columns: + title: title + tagline: tagline + edges: + - edge_type: WROTE + src_type: Person + src_prop: name + dst_type: Movie + dst_prop: title + labels: + - WROTE + chunk_size: 1024 + prefix: edge/Person_WROTE_Movie/ + adj_lists: + - ordered: true + aligned_by: src + file_type: parquet + - ordered: true + aligned_by: dst + file_type: parquet + sources: + - file_type: csv + delimiter: "," + path: /workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv + columns: + srcIndex: _graphArSrcIndex + dstIndex: _graphArDstIndex + - edge_type: ACTED_IN + src_type: Person + src_prop: name + dst_type: Movie + dst_prop: title + labels: + - ACTED_IN + chunk_size: 1024 + prefix: edge/Person_ACTED_IN_Movie/ + adj_lists: + - ordered: true + aligned_by: src + file_type: parquet + - ordered: true + aligned_by: dst + file_type: parquet + sources: + - file_type: csv + delimiter: "," + path: /workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.csv + columns: + srcIndex: _graphArSrcIndex + dstIndex: _graphArDstIndex + - edge_type: REVIEWED + src_type: Person + src_prop: name + dst_type: Movie + dst_prop: title + labels: + - Review + chunk_size: 1024 + prefix: edge/Person_REVIEWED_Movie/ + property_groups: + - file_type: parquet + properties: + - name: rating + data_type: int64 + is_primary: false + nullable: true + - name: summary + data_type: string + is_primary: false + nullable: true + adj_lists: + - ordered: true + aligned_by: src + file_type: parquet + - ordered: true + aligned_by: dst + file_type: parquet + sources: + - file_type: csv + delimiter: "," + path: /workspaces/incubator-graphar/testing/neo4j/data/Person_REVIEWED_Movie.csv + columns: + name: name + title: title + summary: summary + rating: rating diff --git a/cli/pyproject.toml 
b/cli/pyproject.toml index ec59f00c7..2802fbfbd 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -10,7 +10,12 @@ description = "GraphAr command line tool" readme = "README.md" authors = [{ name = "GraphAr community", email = "dev@graphar.apache.org" }] requires-python = ">=3.7" -dependencies = ["typer ~= 0.12.3", "pandas ~= 2.2.2", "pydantic ~= 2.8.2"] +dependencies = [ + "typer ~= 0.12.3", + "pandas ~= 2.2.2", + "pydantic ~= 2.8.2", + "pyyaml ~= 6.0.2", +] [project.scripts] diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py index 5f74c7371..6e48eb8df 100644 --- a/cli/src/graphar_cli/config.py +++ b/cli/src/graphar_cli/config.py @@ -1,57 +1,211 @@ -from typing import Dict, List, Literal, Optional +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional -from pydantic import BaseModel +from pydantic import BaseModel, field_validator, model_validator +from typing_extensions import Self + +default_file_type = "parquet" +default_source_file_type = "csv" +default_adj_list_type = "ordered_by_source" +default_regular_separator = "_" +default_version = "gar/v1" class GraphArConfig(BaseModel): path: str name: str - vertexChunkSize: int = 100 - edgeChunkSize: int = 1024 - fileType: Literal["parquet", "orc", "csv", "json"] = "parquet" - adjListType: Literal[ + vertex_chunk_size: Optional[int] = 100 + edge_chunk_size: Optional[int] = 1024 + file_type: Literal["parquet", "orc", "csv", "json"] = default_file_type + adj_list_type: Literal[ "ordered_by_source", "ordered_by_dest", "unordered_by_source", "unordered_by_dest" - ] = "ordered_by_source" + ] = default_adj_list_type + source_file_type: Literal["parquet", "orc", "csv", "json"] = default_source_file_type + version: Optional[str] = default_version class Property(BaseModel): name: str - type: Literal["bool", "int32", "int64", "float", "double", "string", "date", "timestamp"] - nullable: bool = False + data_type: Literal["bool", "int32", "int64", "float", "double", "string", "date", "timestamp"] + is_primary: bool = False + nullable: Optional[bool] = None + + @model_validator(mode="after") + def check_nullable(self) -> Self: + if self.is_primary and self.nullable: + msg = f"Primary key `{self.name}` must not be nullable." + raise ValueError(msg) + if self.is_primary: + self.nullable = False + elif self.nullable is None: + self.nullable = True + return self + + +class PropertyGroup(BaseModel): + properties: List[Property] + file_type: Optional[Literal["parquet", "orc", "csv", "json"]] = None + + @field_validator("properties") + def check_properties_length(cls, v): + if len(v) == 0: + msg = "properties must not be empty." + raise ValueError(msg) + return v class Source(BaseModel): - type: Literal["parquet", "orc", "csv", "json"] = "csv" + file_type: Optional[Literal["parquet", "orc", "csv", "json"]] = None path: str delimiter: str = "," columns: Dict[str, str] + @field_validator("path") + def check_path(cls, v): + if not Path(v).is_file(): + msg = f"{v} is not a file." + raise ValueError(msg) + return v + + @field_validator("delimiter") + def check_delimiter(cls, v): + if len(v) != 1: + msg = "delimiter must be a single character." 
+ raise ValueError(msg) + return v + class Vertex(BaseModel): type: str - labels: List[str] - primary: str - properties: List[Property] - propertyGroups: List[List[str]] = [] - source: Source + labels: List[str] = [] + chunk_size: Optional[int] = None + prefix: Optional[str] = None + property_groups: List[PropertyGroup] + sources: List[Source] + + @field_validator("property_groups") + def check_property_groups_length(cls, v): + if len(v) == 0: + msg = "property_groups must not be empty." + raise ValueError(msg) + return v + + @field_validator("sources") + def check_sources_length(cls, v): + if len(v) == 0: + msg = "sources must not be empty." + raise ValueError(msg) + return v + + @model_validator(mode="after") + def check_vertex_prefix(self) -> Self: + prefix = self.prefix + type = self.type + if not prefix: + self.prefix = f"vertex/{type}/" + return self + + +class AdjList(BaseModel): + ordered: bool + aligned_by: Literal["src", "dst"] + file_type: Literal["parquet", "orc", "csv", "json"] class Edge(BaseModel): - type: str - srcType: str - srcProp: str - dstType: str - dstProp: str - properties: List[Property] = [] - propertyGroups: List[List[str]] = [] - source: Source + edge_type: str + src_type: str + src_prop: Optional[str] = None + dst_type: str + dst_prop: Optional[str] = None + labels: List[str] = [] + chunk_size: Optional[int] = None + adj_lists: List[AdjList] = [] + property_groups: List[PropertyGroup] = [] + sources: List[Source] + prefix: Optional[str] = None + @field_validator("sources") + def check_sources_length(cls, v): + if len(v) == 0: + msg = "sources must not be empty." + raise ValueError(msg) + return v -class SchemaConfig(BaseModel): + @model_validator(mode="after") + def check_prefix(self) -> Self: + prefix = self.prefix + src_type = self.src_type + edge_type = self.edge_type + dst_type = self.dst_type + if not prefix: + self.prefix = ( + f"edge/{src_type}" + f"{default_regular_separator}{edge_type}" + f"{default_regular_separator}{dst_type}/" + ) + return self + + +class ImportSchema(BaseModel): vertices: List[Vertex] edges: List[Edge] + @field_validator("vertices") + def check_property_groups_length(cls, v): + if len(v) == 0: + msg = "vertices must not be empty." 
+ raise ValueError(msg) + return v + class ImportConfig(BaseModel): graphar: GraphArConfig - import_schema: SchemaConfig + import_schema: ImportSchema + + @field_validator("import_schema") + def replace_none_values(cls, v: ImportSchema, values: Dict[str, Any]): + config = values.data["graphar"] + for vertex in v.vertices: + if vertex.chunk_size is None: + vertex.chunk_size = config.vertex_chunk_size + for property_group in vertex.property_groups: + if property_group.file_type is None: + property_group.file_type = config.file_type + for source in vertex.sources: + if source.file_type is None: + source.file_type = config.source_file_type + for edge in v.edges: + if edge.chunk_size is None: + edge.chunk_size = config.edge_chunk_size + if len(edge.adj_lists) == 0: + if config.adj_list_type == "ordered_by_source": + edge.adj_lists.append( + AdjList(ordered=True, aligned_by="src", file_type=config.file_type) + ) + elif config.adj_list_type == "ordered_by_dest": + edge.adj_lists.append( + AdjList(ordered=True, aligned_by="dst", file_type=config.file_type) + ) + elif config.adj_list_type == "unordered_by_source": + edge.adj_lists.append( + AdjList(ordered=False, aligned_by="src", file_type=config.file_type) + ) + elif config.adj_list_type == "unordered_by_dest": + edge.adj_lists.append( + AdjList(ordered=False, aligned_by="dst", file_type=config.file_type) + ) + else: + msg = "Invalid adj_list_type" + raise ValueError(msg) + else: + for adj_list in edge.adj_lists: + if adj_list.file_type is None: + adj_list.file_type = config.file_type + for property_group in edge.property_groups: + if property_group.file_type is None: + property_group.file_type = config.file_type + for source in edge.sources: + if source.file_type is None: + source.file_type = config.source_file_type + return v diff --git a/cli/src/graphar_cli/graphar_cli.py b/cli/src/graphar_cli/graphar_cli.py index 72b797eed..433df3424 100644 --- a/cli/src/graphar_cli/graphar_cli.py +++ b/cli/src/graphar_cli/graphar_cli.py @@ -5,28 +5,41 @@ import pandas as pd import typer +import yaml -from ._core import ( +from ._core import ( # type: ignore # noqa: PGH003 check_edge, check_graph, check_vertex, + do_import, + get_edge_count, get_edge_types, + get_vertex_count, get_vertex_types, show_edge, show_graph, show_vertex, ) from .config import ImportConfig -from .importer import do_import, validate +from .importer import validate from .logging import setup_logging -app = typer.Typer(help="GraphAr Cli", no_args_is_help=True, add_completion=False) +app = typer.Typer( + help="GraphAr Cli", + no_args_is_help=True, + add_completion=False, + context_settings={"help_option_names": ["-h", "--help"]}, +) setup_logging() logger = getLogger(__name__) -@app.command(context_settings={"help_option_names": ["-h", "--help"]}, help="Show the metadata") +@app.command( + context_settings={"help_option_names": ["-h", "--help"]}, + help="Show the metadata", + no_args_is_help=True, +) def show( path: str = typer.Option(None, "--path", "-p", help="Path to the GraphAr config file"), vertex: str = typer.Option(None, "--vertex", "-v", help="Vertex type to show"), @@ -46,13 +59,14 @@ def show( if vertex not in vertex_types: logger.error("Vertex %s not found in the graph", vertex) raise typer.Exit(1) + logger.info("Vertex count: %s", get_vertex_count(path, vertex)) logger.info(show_vertex(path, vertex)) - raise typer.Exit(0) + raise typer.Exit() if edge or edge_src or edge_dst: if not (edge and edge_src and edge_dst): logger.error("Edge source, edge, and edge destination must 
all be set") raise typer.Exit(1) - edge_types = get_edge_types(path) + edge_types: List[List[str]] = get_edge_types(path) found = False for edge_type in edge_types: if edge_type[0] == edge_src and edge_type[1] == edge and edge_type[2] == edge_dst: @@ -66,12 +80,17 @@ def show( edge_dst, ) raise typer.Exit(1) - logger.info(show_edge(path, edge)) - raise typer.Exit(1) + logger.info("Edge count: %s", get_edge_count(path, edge_src, edge, edge_dst)) + logger.info(show_edge(path, edge_src, edge, edge_dst)) + raise typer.Exit() logger.info(show_graph(path)) -@app.command(context_settings={"help_option_names": ["-h", "--help"]}, help="Check the metadata") +@app.command( + context_settings={"help_option_names": ["-h", "--help"]}, + help="Check the metadata", + no_args_is_help=True, +) def check( path: str = typer.Option(None, "--path", "-p", help="Path to the GraphAr config file"), ): @@ -104,15 +123,35 @@ def check( logger.info("Graph is valid") -@app.command("import", context_settings={"help_option_names": ["-h", "--help"]}, help="Import data") +@app.command( + "import", + context_settings={"help_option_names": ["-h", "--help"]}, + help="Import data", + no_args_is_help=True, +) def import_data( - config_file: str = typer.Option(None, "--config", "-c", help="Path to the GraphAr config file"), + config_file: str = typer.Option(None, "--config", "-c", help="Path of the GraphAr config file"), + config_type: str = typer.Option( + "yaml", + "--type", + "-t", + help="Type of config file", + show_choices=True, + show_default=True, + ), ): - if not Path(config_file).exists(): + if not Path(config_file).is_file(): logger.error("File not found: %s", config_file) raise typer.Exit(1) - with Path(config_file).open(encoding="utf-8") as file: - config = json.load(file) + if config_type == "json": + with Path(config_file).open(encoding="utf-8") as file: + config = json.load(file) + elif config_type == "yaml": + with Path(config_file).open(encoding="utf-8") as file: + config = yaml.safe_load(file) + else: + logger.error("Config type %s not supported", config_type) + raise typer.Exit(1) try: import_config = ImportConfig(**config) validate(import_config) @@ -120,11 +159,12 @@ def import_data( logger.error("Invalid config: %s", e) raise typer.Exit(1) from None try: - do_import(import_config) + logger.info("Starting import") + res = do_import(import_config.model_dump()) + logger.info(res) except Exception as e: logger.error("Import failed: %s", e) raise typer.Exit(1) from None - logger.info(config) @app.command( @@ -136,16 +176,16 @@ def merge_data( output: str = typer.Option(None, "--output", "-o", help="Output file"), ): if type == "parquet": - data = pd.concat([pd.read_parquet(file) for file in files], axis=1) + data = pd.concat([pd.read_parquet(file) for file in files], ignore_index=True) data.to_parquet(output) elif type == "csv": - data = pd.concat([pd.read_csv(file) for file in files], axis=1) + data = pd.concat([pd.read_csv(file) for file in files], ignore_index=True) data.to_csv(output) elif type == "orc": - data = pd.concat([pd.read_orc(file) for file in files], axis=1) + data = pd.concat([pd.read_orc(file) for file in files], ignore_index=True) data.to_orc(output) elif type == "json": - data = pd.concat([pd.read_json(file) for file in files], axis=1) + data = pd.concat([pd.read_json(file) for file in files], ignore_index=True) data.to_json(output) else: logger.error("Type %s not supported", type) diff --git a/cli/src/importer.h b/cli/src/importer.h new file mode 100644 index 000000000..d1672ee99 --- 
/dev/null +++ b/cli/src/importer.h @@ -0,0 +1,217 @@ +#include + +#include +#include + +#include + +namespace py = pybind11; +namespace fs = std::filesystem; + +struct GraphArConfig { + std::string path; + std::string name; + std::string version; +}; + +struct Property { + std::string name; + std::string data_type; + bool is_primary; + bool nullable; +}; + +struct PropertyGroup { + std::string file_type; + std::vector properties; +}; + +struct Source { + std::string file_type; + std::string path; + char delimiter; + std::map columns; +}; + +struct Vertex { + std::string type; + std::vector labels; + int chunk_size; + std::string prefix; + std::vector property_groups; + std::vector sources; +}; + +struct AdjList { + bool ordered; + std::string aligned_by; + std::string file_type; +}; + +struct Edge { + std::string edge_type; + std::string src_type; + std::string src_prop; + std::string dst_type; + std::string dst_prop; + std::vector labels; + int chunk_size; + std::string prefix; + std::vector adj_lists; + std::vector property_groups; + std::vector sources; +}; + +struct ImportSchema { + std::vector vertices; + std::vector edges; +}; + +struct ImportConfig { + GraphArConfig graphar_config; + ImportSchema import_schema; +}; + +ImportConfig ConvertPyDictToConfig(const py::dict& config_dict) { + ImportConfig import_config; + + auto graphar_dict = config_dict["graphar"].cast(); + import_config.graphar_config.path = graphar_dict["path"].cast(); + import_config.graphar_config.name = graphar_dict["name"].cast(); + import_config.graphar_config.version = + graphar_dict["version"].cast(); + + auto schema_dict = config_dict["import_schema"].cast(); + + auto vertices_list = schema_dict["vertices"].cast>(); + for (const auto& vertex_dict : vertices_list) { + Vertex vertex; + vertex.type = vertex_dict["type"].cast(); + vertex.chunk_size = vertex_dict["chunk_size"].cast(); + vertex.prefix = vertex_dict["prefix"].cast(); + vertex.labels = vertex_dict["labels"].cast>(); + + auto pg_list = vertex_dict["property_groups"].cast>(); + for (const auto& pg_dict : pg_list) { + PropertyGroup pg; + pg.file_type = pg_dict["file_type"].cast(); + + auto prop_list = pg_dict["properties"].cast>(); + for (const auto& prop_dict : prop_list) { + Property prop; + prop.name = prop_dict["name"].cast(); + prop.data_type = prop_dict["data_type"].cast(); + prop.is_primary = prop_dict["is_primary"].cast(); + prop.nullable = prop_dict["nullable"].cast(); + pg.properties.push_back(prop); + } + vertex.property_groups.push_back(pg); + } + + auto source_list = vertex_dict["sources"].cast>(); + for (const auto& source_dict : source_list) { + Source src; + src.file_type = source_dict["file_type"].cast(); + src.path = source_dict["path"].cast(); + src.delimiter = source_dict["delimiter"].cast(); + src.columns = + source_dict["columns"].cast>(); + + vertex.sources.push_back(src); + } + + import_config.import_schema.vertices.push_back(vertex); + } + + auto edges_list = schema_dict["edges"].cast>(); + for (const auto& edge_dict : edges_list) { + Edge edge; + edge.edge_type = edge_dict["edge_type"].cast(); + edge.src_type = edge_dict["src_type"].cast(); + edge.src_prop = edge_dict["src_prop"].cast(); + edge.dst_type = edge_dict["dst_type"].cast(); + edge.dst_prop = edge_dict["dst_prop"].cast(); + edge.labels = edge_dict["labels"].cast>(); + edge.chunk_size = edge_dict["chunk_size"].cast(); + edge.prefix = edge_dict["prefix"].cast(); + + auto adj_list_dicts = edge_dict["adj_lists"].cast>(); + for (const auto& adj_list_dict : adj_list_dicts) { + 
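+      // Each adj_lists entry selects one adjacency-list layout for the edge:
+      // whether it is ordered, which endpoint it is aligned by, and the file
+      // type it is stored as.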
AdjList adj_list; + adj_list.ordered = adj_list_dict["ordered"].cast(); + adj_list.aligned_by = adj_list_dict["aligned_by"].cast(); + adj_list.file_type = adj_list_dict["file_type"].cast(); + edge.adj_lists.push_back(adj_list); + } + + auto edge_pg_list = + edge_dict["property_groups"].cast>(); + for (const auto& edge_pg_dict : edge_pg_list) { + PropertyGroup edge_pg; + edge_pg.file_type = edge_pg_dict["file_type"].cast(); + auto edge_prop_list = + edge_pg_dict["properties"].cast>(); + + for (const auto& prop_dict : edge_prop_list) { + Property edge_prop; + edge_prop.name = prop_dict["name"].cast(); + edge_prop.data_type = prop_dict["data_type"].cast(); + edge_prop.is_primary = prop_dict["is_primary"].cast(); + edge_prop.nullable = prop_dict["nullable"].cast(); + edge_pg.properties.push_back(edge_prop); + } + + edge.property_groups.push_back(edge_pg); + } + + auto edge_source_list = edge_dict["sources"].cast>(); + for (const auto& edge_source_dict : edge_source_list) { + Source edge_src; + edge_src.file_type = edge_source_dict["file_type"].cast(); + edge_src.path = edge_source_dict["path"].cast(); + edge_src.delimiter = edge_source_dict["delimiter"].cast(); + edge_src.columns = edge_source_dict["columns"] + .cast>(); + + edge.sources.push_back(edge_src); + } + + import_config.import_schema.edges.push_back(edge); + } + + return import_config; +} + +std::string DoImport(const py::dict& config_dict) { + auto import_config = ConvertPyDictToConfig(config_dict); + + auto version = + graphar::InfoVersion::Parse(import_config.graphar_config.version).value(); + fs::path save_path = import_config.graphar_config.path; + + for (const auto& vertex : import_config.import_schema.vertices) { + auto pgs = std::vector>(); + + for (const auto& pg : vertex.property_groups) { + std::vector props; + for (const auto& prop : pg.properties) { + props.push_back(graphar::Property( + prop.name, graphar::DataType::TypeNameToDataType(prop.data_type), + prop.is_primary, prop.nullable)); + } + // TODO: add prefix parameter in config + auto property_group = graphar::CreatePropertyGroup( + props, graphar::StringToFileType(pg.file_type)); + pgs.push_back(property_group); + } + + auto vertex_info = graphar::CreateVertexInfo(vertex.type, vertex.chunk_size, + pgs, vertex.prefix, version); + auto file_name = vertex.type + ".vertex.yml"; + vertex_info->Save(save_path / file_name); + // TODO: add start_index in config + graphar::IdType start_index = 0; + } + + return "Importing..."; +} \ No newline at end of file diff --git a/cli/src/main.cc b/cli/src/main.cc index 3387dca46..2ed2e4157 100644 --- a/cli/src/main.cc +++ b/cli/src/main.cc @@ -1,7 +1,10 @@ #include #include +#include #include +#include +#include "importer.h" #define STRINGIFY(x) #x #define MACRO_STRINGIFY(x) STRINGIFY(x) @@ -44,6 +47,64 @@ bool CheckEdge(const std::string& path, const std::string& src_type, return edge_info->IsValidated(); } +int64_t GetVertexCount(const std::string& path, + const std::string& vertex_type) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + auto graph_prefix = graph_info->GetPrefix(); + auto vertex_info = graph_info->GetVertexInfo(vertex_type); + return graphar::util::GetVertexNum(graph_prefix, vertex_info).value(); +} + +// TODO(ljj): Add this to graphar library + +std::vector _GetAdjListTypes( + const std::shared_ptr& edge_info) { + std::vector adj_list_types; + if (edge_info->HasAdjacentListType(graphar::AdjListType::ordered_by_dest)) { + adj_list_types.push_back(graphar::AdjListType::ordered_by_dest); + } + if 
(edge_info->HasAdjacentListType(graphar::AdjListType::ordered_by_source)) { + adj_list_types.push_back(graphar::AdjListType::ordered_by_source); + } + if (edge_info->HasAdjacentListType(graphar::AdjListType::unordered_by_dest)) { + adj_list_types.push_back(graphar::AdjListType::unordered_by_dest); + } + if (edge_info->HasAdjacentListType( + graphar::AdjListType::unordered_by_source)) { + adj_list_types.push_back(graphar::AdjListType::unordered_by_source); + } + if (adj_list_types.empty()) { + throw std::runtime_error("No valid adj list type found"); + } + return adj_list_types; +} + +int64_t GetEdgeCount(const std::string& path, const std::string& src_type, + const std::string& edge_type, + const std::string& dst_type) { + auto graph_info = graphar::GraphInfo::Load(path).value(); + auto graph_prefix = graph_info->GetPrefix(); + auto edge_info = graph_info->GetEdgeInfo(src_type, edge_type, dst_type); + auto adj_list_types = _GetAdjListTypes(edge_info); + auto adj_list_type = adj_list_types[0]; + auto vertices_num_file_path = + edge_info->GetVerticesNumFilePath(adj_list_type).value(); + std::string base_dir; + auto fs = graphar::FileSystemFromUriOrPath(graph_prefix, &base_dir).value(); + std::string vertices_num_path = base_dir + vertices_num_file_path; + auto vertices_num = fs->ReadFileToValue(vertices_num_path).value(); + int max_chunk_index = (vertices_num + edge_info->GetSrcChunkSize() - 1) / + edge_info->GetSrcChunkSize(); + int64_t edge_count = 0; + for (int i = 0; i < max_chunk_index; i++) { + // TODO: file may not exist + edge_count += + graphar::util::GetEdgeNum(graph_prefix, edge_info, adj_list_type, i) + .value(); + } + return edge_count; +} + std::vector GetVertexTypes(const std::string& path) { auto graph_info = graphar::GraphInfo::Load(path).value(); auto vertex_infos = graph_info->GetVertexInfos(); @@ -60,8 +121,8 @@ std::vector> GetEdgeTypes(const std::string& path) { std::vector> edge_types; for (const auto& edge_info : edge_infos) { std::vector edge_type; - edge_type.push_back(edge_info->GetEdgeType()); edge_type.push_back(edge_info->GetSrcType()); + edge_type.push_back(edge_info->GetEdgeType()); edge_type.push_back(edge_info->GetDstType()); edge_types.push_back(edge_type); } @@ -79,7 +140,9 @@ PYBIND11_MODULE(_core, m) { m.def("check_edge", &CheckEdge, "Check the edge info"); m.def("get_vertex_types", &GetVertexTypes, "Get the vertex types"); m.def("get_edge_types", &GetEdgeTypes, "Get the edge types"); - + m.def("get_vertex_count", &GetVertexCount, "Get the vertex count"); + m.def("get_edge_count", &GetEdgeCount, "Get the edge count"); + m.def("do_import", &DoImport, "Do the import"); #ifdef VERSION_INFO m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); #else From dd72ad84df63dd15110e63141e5b2e7b565bc7a4 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Sun, 20 Oct 2024 03:44:28 +0800 Subject: [PATCH 05/30] change config --- cli/import.mini.yml | 71 +++++++++++++++++ cli/src/graphar_cli/config.py | 15 +++- cli/src/graphar_cli/importer.py | 131 +++++++++++++++++++------------- 3 files changed, 163 insertions(+), 54 deletions(-) create mode 100644 cli/import.mini.yml diff --git a/cli/import.mini.yml b/cli/import.mini.yml new file mode 100644 index 000000000..7ca3ea392 --- /dev/null +++ b/cli/import.mini.yml @@ -0,0 +1,71 @@ +graphar: + path: /workspaces/incubator-graphar/graphar/movie + name: MovieGraph + +import_schema: + vertices: + - type: Person + property_groups: + - properties: + - name: name + data_type: string + is_primary: true + - name: born + 
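+            # is_primary and nullable are omitted: is_primary defaults to false,
+            # and non-primary properties default to nullable (see
+            # Property.check_nullable in cli/src/graphar_cli/config.py).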
data_type: int64 + sources: + - path: /workspaces/incubator-graphar/testing/neo4j/data/Person.parquet + columns: + name: name + born: born + - type: Movie + property_groups: + - properties: + - name: title + data_type: string + is_primary: true + - name: tagline + data_type: string + sources: + - path: /workspaces/incubator-graphar/testing/neo4j/data/Movie.parquet + columns: + title: title + tagline: tagline + edges: + - edge_type: WROTE + src_type: Person + src_prop: name + dst_type: Movie + dst_prop: title + sources: + - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv + columns: + srcIndex: _graphArSrcIndex + dstIndex: _graphArDstIndex + - edge_type: ACTED_IN + src_type: Person + src_prop: name + dst_type: Movie + dst_prop: title + sources: + - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.csv + columns: + srcIndex: _graphArSrcIndex + dstIndex: _graphArDstIndex + - edge_type: REVIEWED + src_type: Person + src_prop: name + dst_type: Movie + dst_prop: title + property_groups: + - properties: + - name: rating + data_type: int64 + - name: summary + data_type: string + sources: + - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_REVIEWED_Movie.csv + columns: + srcIndex: _graphArSrcIndex + dstIndex: _graphArDstIndex + summary: summary + rating: rating diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py index 6e48eb8df..eb2391c3f 100644 --- a/cli/src/graphar_cli/config.py +++ b/cli/src/graphar_cli/config.py @@ -1,9 +1,12 @@ +from logging import getLogger from pathlib import Path from typing import Any, Dict, List, Literal, Optional from pydantic import BaseModel, field_validator, model_validator from typing_extensions import Self +logger = getLogger("graphar_cli") + default_file_type = "parquet" default_source_file_type = "csv" default_adj_list_type = "ordered_by_source" @@ -23,6 +26,16 @@ class GraphArConfig(BaseModel): source_file_type: Literal["parquet", "orc", "csv", "json"] = default_source_file_type version: Optional[str] = default_version + @field_validator("path") + def check_path(cls, v): + path = Path(v) + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + elif any(path.iterdir()): + msg = f"Warning: Path {v} already exists and contains files." 
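+            # The target directory is created if missing; a non-empty directory only triggers a warning, not an error.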
+ logger.warning(msg) + return v + class Property(BaseModel): name: str @@ -38,7 +51,7 @@ def check_nullable(self) -> Self: if self.is_primary: self.nullable = False elif self.nullable is None: - self.nullable = True + self.nullable = True return self diff --git a/cli/src/graphar_cli/importer.py b/cli/src/graphar_cli/importer.py index 66f603d05..bd7b621d3 100644 --- a/cli/src/graphar_cli/importer.py +++ b/cli/src/graphar_cli/importer.py @@ -1,76 +1,101 @@ +from logging import getLogger + from .config import ImportConfig +logger = getLogger("graphar_cli") + -def validate(config: ImportConfig): +def validate(import_config: ImportConfig): vertex_types = set() - for vertex in config.import_schema.vertices: + for vertex in import_config.import_schema.vertices: if vertex.type in vertex_types: msg = f"Duplicate vertex type {vertex.type}" raise ValueError(msg) vertex_types.add(vertex.type) prop_names = set() - for prop in vertex.properties: - if prop.name in prop_names: - msg = f"Duplicate property {prop.name} in vertex properties" - raise ValueError(msg) - prop_names.add(prop.name) - if prop.name == vertex.primary and prop.nullable: - msg = f"Primary key {vertex.primary} cannot be nullable" - raise ValueError(msg) - if vertex.primary not in prop_names: - msg = f"Primary key {vertex.primary} not found in vertex properties" - raise ValueError(msg) - prop_names_in_groups = set() - for prop_group in vertex.propertyGroups: - for prop_name in prop_group: - if prop_name not in prop_names: - msg = f"Property {prop} not found in vertex properties" - raise ValueError(msg) - if prop_name in prop_names_in_groups: - msg = f"Property {prop} found in multiple property groups" + primary_keys = [] + for prop_group in vertex.property_groups: + for prop in prop_group.properties: + if prop.name in prop_names: + msg = f"Duplicate property `{prop.name}` in vertex `{vertex.type}`" raise ValueError(msg) - prop_names_in_groups.add(prop_name) + prop_names.add(prop.name) + if prop.is_primary: + if len(primary_keys): + msg = ( + f"Multiple primary keys `{primary_keys[0]}` and `{prop.name}` " + f"found in vertex `{vertex.type}`" + ) + raise ValueError(msg) + primary_keys.append(prop.name) + if prop.nullable: + msg = f"Primary key `{prop.name}` in `{vertex.type}` cannot be nullable" + raise ValueError(msg) + source_keys = [key for source in vertex.sources for key in source.columns] for prop_name in prop_names: - if prop_name not in vertex.source.columns: - msg = f"Property {prop_name} not found in source columns" + if prop_name not in source_keys: + msg = ( + f"Property `{prop_name}` in vertex `{vertex.type}` not found in source columns" + ) raise ValueError(msg) - + logger.debug("Validated vertex %s", vertex.type) + edge_types = set() - for edge in config.import_schema.edges: - if edge.type in edge_types: + for edge in import_config.import_schema.edges: + if edge.edge_type in edge_types: msg = f"Duplicate edge type {edge.type}" raise ValueError(msg) - edge_types.add(edge.type) + edge_types.add(edge.edge_type) - if edge.srcType not in vertex_types: - msg = f"Source vertex type {edge.srcType} not found" + if edge.src_type not in vertex_types: + msg = f"Source vertex type {edge.src_type} not found" raise ValueError(msg) - if edge.dstType not in vertex_types: - msg = f"Destination vertex type {edge.dstType} not found" + if edge.dst_type not in vertex_types: + msg = f"Destination vertex type {edge.dst_type} not found" + raise ValueError(msg) + src_vertex = next( + vertex + for vertex in import_config.import_schema.vertices + if 
vertex.type == edge.src_type + ) + if edge.src_prop not in [ + prop.name for prop_group in src_vertex.property_groups for prop in prop_group.properties + ]: + msg = ( + f"Source property `{edge.src_prop}` " + f"not found in source vertex `{edge.src_type}` " + f"in edge `{edge.edge_type}`" + ) + raise ValueError(msg) + dst_vertex = next( + vertex + for vertex in import_config.import_schema.vertices + if vertex.type == edge.dst_type + ) + if edge.dst_prop not in [ + prop.name for prop_group in dst_vertex.property_groups for prop in prop_group.properties + ]: + msg = ( + f"Destination property `{edge.dst_prop}` " + f"not found in destination vertex `{edge.dst_type}` " + f"in edge `{edge.edge_type}`" + ) raise ValueError(msg) - # TODO: Check that source and destination properties are present in the source vertex prop_names = set() - for prop in edge.properties: - if prop.name in prop_names: - msg = f"Duplicate property {prop.name} in edge properties" - raise ValueError(msg) - prop_names.add(prop.name) - prop_names_in_groups = set() - for prop_group in edge.propertyGroups: - for prop_name in prop_group: - if prop_name not in prop_names: - msg = f"Property {prop} not found in edge properties" + for prop_group in edge.property_groups: + for prop in prop_group.properties: + if prop.name in prop_names: + msg = f"Duplicate property `{prop.name}` in edge `{edge.edge_type}`" raise ValueError(msg) - if prop_name in prop_names_in_groups: - msg = f"Property {prop} found in multiple property groups" - raise ValueError(msg) - prop_names_in_groups.add(prop_name) + prop_names.add(prop.name) + + source_keys = [key for source in edge.sources for key in source.columns] for prop_name in prop_names: - if prop_name not in edge.source.columns: - msg = f"Property {prop_name} not found in source columns" + if prop_name not in source_keys: + msg = ( + f"Property `{prop_name}` in edge `{edge.edge_type}` " + f"not found in source columns" + ) raise ValueError(msg) - - -def do_import(config: ImportConfig): - pass \ No newline at end of file + logger.debug("Validated edge %s %s %s", edge.src_type, edge.edge_type, edge.dst_type) From 5f7ed3c99cd274eb5b1fdb4c690e6a82e3161797 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Fri, 25 Oct 2024 17:23:44 +0800 Subject: [PATCH 06/30] finish --- .github/workflows/cli.yml | 6 +- cli/README.md | 59 +++- cli/import.full.json | 223 +++++++++++++ cli/import.full.yml | 41 +-- cli/import.mini.json | 130 ++++++++ cli/import.mini.yml | 16 +- cli/pyproject.toml | 9 +- cli/src/graphar_cli/config.py | 75 +++-- cli/src/graphar_cli/graphar_cli.py | 30 +- cli/src/graphar_cli/importer.py | 38 ++- cli/src/importer.h | 311 ++++++++++++++++-- cli/src/util.h | 464 +++++++++++++++++++++++++++ cli/test/merge.py | 86 +++++ cli/test/test_basic.py | 7 + cpp/src/graphar/arrow/chunk_writer.h | 8 +- cpp/src/graphar/graph_info.h | 2 + 16 files changed, 1369 insertions(+), 136 deletions(-) create mode 100644 cli/import.full.json create mode 100644 cli/import.mini.json create mode 100644 cli/src/util.h create mode 100644 cli/test/merge.py create mode 100644 cli/test/test_basic.py diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index 0de3eeb44..f9d61b793 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -97,8 +97,12 @@ jobs: brew bundle --file=cpp/Brewfile - - name: Build GraphAr + - name: Build GraphAr And Run Tests working-directory: "cli" run: | pip install ./ -vvv graphar-cli --help + graphar-cli check -f ../testing/neo4j/MovieGraph.graph.yml + 
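          # also smoke-test the show and import subcommands against the sample movie graph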
graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -v Person
+          graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie
+          graphar-cli import -c import.mini.yml

diff --git a/cli/README.md b/cli/README.md
index eb4eda2cf..369fc1b02 100644
--- a/cli/README.md
+++ b/cli/README.md
@@ -1,4 +1,4 @@
-# GraphAr Cli (Under Development)
+# GraphAr Cli
 
 GraphAr Cli uses [pybind11][] and [scikit-build-core][] to bind C++ code into Python and build command line tools through Python. Command line tools developed using [typer][].
@@ -8,7 +8,7 @@ GraphAr Cli uses [pybind11][] and [scikit-build-core][] to bind C++ code into Py
 ## Requirements
 
-- Ubuntu 22.04
+- Linux (works fine on Ubuntu 22.04)
 - Cmake >= 3.15
 - Arrow >= 12.0
 - Python >= 3.7
@@ -28,6 +28,61 @@ And using Python in conda or venv is a good choice.
 
 ```bash
 graphar-cli --help
+
+# check the metadata: verify that the graph's vertex, edge, and property information is valid
+graphar-cli check -f ../testing/neo4j/MovieGraph.graph.yml
+
+# show a vertex type
+graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -v Person
+
+# show an edge type
+graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie
+
+# import graph data using a config file
+graphar-cli import -c import.mini.yml
 ```
 
 ## Import config file
+
+The config file can be written in two formats: YAML and JSON. For each format we provide two reference templates: full and mini.
+
+The full template contains every configurable field; unknown extra fields are automatically ignored.
+
+The mini template is a simplified version of the full one with the same functionality: it keeps only the essential parts of the configuration. A full configuration whose optional fields all take their default values can be reduced to the mini form, but no further.
+
+The YAML templates carry brief comments on each field, which can be used as a reference.
+
+**Example**
+
+To import the movie graph data from the `testing` directory, first prepare the data files. Supported file types are `csv`, `json` (including `jsonline`, which should also use the `.json` extension), `parquet`, and `orc`. Make sure the files have the correct extensions in advance, or specify the file type in the source section of the configuration.
+
+Next, write a configuration file following the provided samples. Empty fields in the `graphar` section are filled with default values. Empty fields in `import_schema` fall back to the global values from `graphar`; non-empty fields override them.
+
+A few important notes:
+
+1. The sources list configures the data source files. For `csv` files you can set the `delimiter`. `json` files are expected to be in `jsonline` format.
+
+2. The columns dictionary maps column names in the data source to vertex or edge properties: keys are column names in the source file, values are property names.
+
+3. Currently, an edge property cannot have the same name as a property of the edge's endpoints; doing so raises an exception.
+
+4. Many fields have default values; the table below lists the main ones (the full templates show the rest).
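+As a quick sanity check, a config can be loaded and validated programmatically with the same pydantic models the CLI uses. A minimal sketch (it assumes the package is installed and is run from the `cli` directory, with the shipped `import.mini.yml` template):
+
+```python
+import yaml
+
+from graphar_cli.config import ImportConfig
+from graphar_cli.importer import validate
+
+# Parse the YAML template; pydantic fills in every omitted field
+# (chunk sizes, file types, validate levels, adjacency lists, ...).
+with open("import.mini.yml") as f:
+    config = ImportConfig(**yaml.safe_load(f))
+
+# Cross-check the schema: unique vertex/edge types, exactly one
+# non-nullable primary key per vertex, and source columns that cover
+# every declared property.
+validate(config)
+
+print(config.graphar.vertex_chunk_size)  # 100, the default
+```
+
+Note that merely loading a config creates the output `path` directory if it does not exist yet; that is a side effect of the `check_path` validator.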
+ + +| Field | Default value | +| ----------- | ----------- | +| `graphar.vertex_chunk_size` | `100` | +| `graphar.edge_chunk_size` | `1024` | +| `graphar.file_type` | `parquet` | +| `graphar.adj_list_type` | `ordered_by_source` | +| `graphar.validate_level` | `weak` | +| `graphar.version` | `gar/v1` | +| `property.nullable` | `true` | + + + +Wish you a happy use! \ No newline at end of file diff --git a/cli/import.full.json b/cli/import.full.json new file mode 100644 index 000000000..86c6b8f0c --- /dev/null +++ b/cli/import.full.json @@ -0,0 +1,223 @@ +{ + "graphar": { + "path": "/workspaces/incubator-graphar/graphar/movie", + "name": "MovieGraph", + "vertex_chunk_size": 100, + "edge_chunk_size": 1024, + "file_type": "parquet", + "adj_list_type": "ordered_by_source", + "validate_level": "weak", + "version": "gar/v1" + }, + "import_schema": { + "vertices": [ + { + "type": "Person", + "labels": [ + "Person" + ], + "chunk_size": 100, + "validate_level": "no", + "prefix": "vertex/Person/", + "property_groups": [ + { + "file_type": "parquet", + "properties": [ + { + "name": "name", + "data_type": "string", + "is_primary": true, + "nullable": false + }, + { + "name": "born", + "data_type": "int64", + "is_primary": false, + "nullable": true + } + ] + } + ], + "sources": [ + { + "file_type": "parquet", + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person.parquet", + "columns": { + "name": "name", + "born": "born" + } + } + ] + }, + { + "type": "Movie", + "labels": [ + "Movie" + ], + "chunk_size": 100, + "validate_level": "no", + "prefix": "vertex/Movie/", + "property_groups": [ + { + "file_type": "parquet", + "properties": [ + { + "name": "title", + "data_type": "string", + "is_primary": true, + "nullable": false + }, + { + "name": "tagline", + "data_type": "string", + "is_primary": false, + "nullable": true + } + ] + } + ], + "sources": [ + { + "file_type": "orc", + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Movie.orc", + "columns": { + "title": "title", + "tagline": "tagline" + } + } + ] + } + ], + "edges": [ + { + "edge_type": "WROTE", + "src_type": "Person", + "src_prop": "name", + "dst_type": "Movie", + "dst_prop": "title", + "labels": [ + "WROTE" + ], + "chunk_size": 1024, + "validate_level": "no", + "prefix": "edge/Person_WROTE_Movie/", + "adj_lists": [ + { + "ordered": true, + "aligned_by": "src", + "file_type": "parquet" + }, + { + "ordered": true, + "aligned_by": "dst", + "file_type": "parquet" + } + ], + "sources": [ + { + "file_type": "csv", + "delimiter": ",", + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv", + "columns": { + "name": "name", + "title": "title" + } + } + ] + }, + { + "edge_type": "ACTED_IN", + "src_type": "Person", + "src_prop": "name", + "dst_type": "Movie", + "dst_prop": "title", + "validate_level": "no", + "labels": [ + "ACTED_IN" + ], + "chunk_size": 1024, + "prefix": "edge/Person_ACTED_IN_Movie/", + "adj_lists": [ + { + "ordered": true, + "aligned_by": "src", + "file_type": "parquet" + }, + { + "ordered": true, + "aligned_by": "dst", + "file_type": "parquet" + } + ], + "sources": [ + { + "file_type": "json", + "delimiter": ",", + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.json", + "columns": { + "name": "name", + "title": "title" + } + } + ] + }, + { + "edge_type": "REVIEWED", + "src_type": "Person", + "src_prop": "name", + "dst_type": "Movie", + "dst_prop": "title", + "labels": [ + "Review" + ], + "chunk_size": 1024, + "validate_level": "no", + "prefix": 
"edge/Person_REVIEWED_Movie/", + "property_groups": [ + { + "file_type": "parquet", + "properties": [ + { + "name": "rating", + "data_type": "int64", + "is_primary": false, + "nullable": true + }, + { + "name": "summary", + "data_type": "string", + "is_primary": false, + "nullable": true + } + ] + } + ], + "adj_lists": [ + { + "ordered": true, + "aligned_by": "src", + "file_type": "parquet" + }, + { + "ordered": true, + "aligned_by": "dst", + "file_type": "parquet" + } + ], + "sources": [ + { + "file_type": "csv", + "delimiter": ",", + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_REVIEWED_Movie.csv", + "columns": { + "name": "name", + "title": "title", + "summary": "summary", + "rating": "rating" + } + } + ] + } + ] + } +} \ No newline at end of file diff --git a/cli/import.full.yml b/cli/import.full.yml index cacdeed6d..36e571b2f 100644 --- a/cli/import.full.yml +++ b/cli/import.full.yml @@ -1,12 +1,12 @@ -graphar: - path: /workspaces/incubator-graphar/graphar/movie - name: MovieGraph - vertex_chunk_size: 100 - edge_chunk_size: 1024 - file_type: parquet - adj_list_type: ordered_by_source - source_file_type: parquet - version: gar/v1 +graphar: # Global configuration of imported data + path: /workspaces/incubator-graphar/graphar/movie # (Required) Path to the graphar directory + name: MovieGraph # (Required) Name of the graph + vertex_chunk_size: 100 # Number of vertices per chunk + edge_chunk_size: 1024 # Number of edges per chunk + file_type: parquet # Default file type for vertices and edges, can be "parquet", "orc", "csv", "json" + adj_list_type: ordered_by_source # Default adjacency list type, can be "ordered_by_source", "ordered_by_dest", "unordered_by_source", "unordered_by_dest" + validate_level: weak # Default validation level for vertices and edges, can be "no", "weak", "strong" + version: gar/v1 # Version of the graphar schema import_schema: vertices: @@ -14,6 +14,7 @@ import_schema: labels: - Person chunk_size: 100 + validate_level: "no" prefix: vertex/Person/ property_groups: - file_type: parquet @@ -36,6 +37,7 @@ import_schema: labels: - Movie chunk_size: 100 + validate_level: "no" prefix: vertex/Movie/ property_groups: - file_type: parquet @@ -49,8 +51,8 @@ import_schema: is_primary: false nullable: true sources: - - file_type: parquet - path: /workspaces/incubator-graphar/testing/neo4j/data/Movie.parquet + - file_type: orc + path: /workspaces/incubator-graphar/testing/neo4j/data/Movie.orc columns: title: title tagline: tagline @@ -63,6 +65,7 @@ import_schema: labels: - WROTE chunk_size: 1024 + validate_level: "no" prefix: edge/Person_WROTE_Movie/ adj_lists: - ordered: true @@ -76,13 +79,14 @@ import_schema: delimiter: "," path: /workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv columns: - srcIndex: _graphArSrcIndex - dstIndex: _graphArDstIndex + name: name + title: title - edge_type: ACTED_IN src_type: Person src_prop: name dst_type: Movie dst_prop: title + validate_level: "no" labels: - ACTED_IN chunk_size: 1024 @@ -95,12 +99,12 @@ import_schema: aligned_by: dst file_type: parquet sources: - - file_type: csv - delimiter: "," - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.csv + - file_type: json + delimiter: "," # will be ignored + path: /workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.json columns: - srcIndex: _graphArSrcIndex - dstIndex: _graphArDstIndex + name: name + title: title - edge_type: REVIEWED src_type: Person src_prop: name @@ -109,6 +113,7 @@ import_schema: labels: - 
Review chunk_size: 1024 + validate_level: "no" prefix: edge/Person_REVIEWED_Movie/ property_groups: - file_type: parquet diff --git a/cli/import.mini.json b/cli/import.mini.json new file mode 100644 index 000000000..8d85c77f0 --- /dev/null +++ b/cli/import.mini.json @@ -0,0 +1,130 @@ +{ + "graphar": { + "path": "/workspaces/incubator-graphar/graphar/movie", + "name": "MovieGraph" + }, + "import_schema": { + "vertices": [ + { + "type": "Person", + "property_groups": [ + { + "properties": [ + { + "name": "name", + "data_type": "string", + "is_primary": true + }, + { + "name": "born", + "data_type": "int64" + } + ] + } + ], + "sources": [ + { + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person.parquet", + "columns": { + "name": "name", + "born": "born" + } + } + ] + }, + { + "type": "Movie", + "property_groups": [ + { + "properties": [ + { + "name": "title", + "data_type": "string", + "is_primary": true + }, + { + "name": "tagline", + "data_type": "string" + } + ] + } + ], + "sources": [ + { + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Movie.orc", + "columns": { + "title": "title", + "tagline": "tagline" + } + } + ] + } + ], + "edges": [ + { + "edge_type": "WROTE", + "src_type": "Person", + "src_prop": "name", + "dst_type": "Movie", + "dst_prop": "title", + "sources": [ + { + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv", + "columns": { + "name": "name", + "title": "title" + } + } + ] + }, + { + "edge_type": "ACTED_IN", + "src_type": "Person", + "src_prop": "name", + "dst_type": "Movie", + "dst_prop": "title", + "sources": [ + { + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.csv", + "columns": { + "name": "name", + "title": "title" + } + } + ] + }, + { + "edge_type": "REVIEWED", + "src_type": "Person", + "src_prop": "name", + "dst_type": "Movie", + "dst_prop": "title", + "property_groups": [ + { + "properties": [ + { + "name": "rating", + "data_type": "int64" + }, + { + "name": "summary", + "data_type": "string" + } + ] + } + ], + "sources": [ + { + "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_REVIEWED_Movie.csv", + "columns": { + "name": "name", + "title": "title", + "summary": "summary", + "rating": "rating" + } + } + ] + } + ] + } +} \ No newline at end of file diff --git a/cli/import.mini.yml b/cli/import.mini.yml index 7ca3ea392..1b1a2a8f8 100644 --- a/cli/import.mini.yml +++ b/cli/import.mini.yml @@ -26,7 +26,7 @@ import_schema: - name: tagline data_type: string sources: - - path: /workspaces/incubator-graphar/testing/neo4j/data/Movie.parquet + - path: /workspaces/incubator-graphar/testing/neo4j/data/Movie.orc columns: title: title tagline: tagline @@ -39,18 +39,18 @@ import_schema: sources: - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv columns: - srcIndex: _graphArSrcIndex - dstIndex: _graphArDstIndex + name: name + title: title - edge_type: ACTED_IN src_type: Person src_prop: name dst_type: Movie dst_prop: title sources: - - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.csv + - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.json columns: - srcIndex: _graphArSrcIndex - dstIndex: _graphArDstIndex + name: name + title: title - edge_type: REVIEWED src_type: Person src_prop: name @@ -65,7 +65,7 @@ import_schema: sources: - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_REVIEWED_Movie.csv columns: - srcIndex: _graphArSrcIndex - dstIndex: _graphArDstIndex + name: 
name + title: title summary: summary rating: rating diff --git a/cli/pyproject.toml b/cli/pyproject.toml index 2802fbfbd..889e3e157 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -10,13 +10,10 @@ description = "GraphAr command line tool" readme = "README.md" authors = [{ name = "GraphAr community", email = "dev@graphar.apache.org" }] requires-python = ">=3.7" -dependencies = [ - "typer ~= 0.12.3", - "pandas ~= 2.2.2", - "pydantic ~= 2.8.2", - "pyyaml ~= 6.0.2", -] +dependencies = ["typer ~= 0.12.3", "pydantic ~= 2.8.2", "pyyaml ~= 6.0.2"] +[project.optional-dependencies] +test = ["pandas ~= 2.2.2"] [project.scripts] graphar-cli = "graphar_cli.graphar_cli:main" diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py index eb2391c3f..3bfdca5df 100644 --- a/cli/src/graphar_cli/config.py +++ b/cli/src/graphar_cli/config.py @@ -1,17 +1,19 @@ from logging import getLogger from pathlib import Path -from typing import Any, Dict, List, Literal, Optional +from typing import Dict, List, Literal, Optional from pydantic import BaseModel, field_validator, model_validator from typing_extensions import Self logger = getLogger("graphar_cli") +# TODO: convert to constants default_file_type = "parquet" -default_source_file_type = "csv" default_adj_list_type = "ordered_by_source" default_regular_separator = "_" +default_validate_level = "weak" default_version = "gar/v1" +support_file_types = {"parquet", "orc", "csv", "json"} class GraphArConfig(BaseModel): @@ -23,7 +25,7 @@ class GraphArConfig(BaseModel): adj_list_type: Literal[ "ordered_by_source", "ordered_by_dest", "unordered_by_source", "unordered_by_dest" ] = default_adj_list_type - source_file_type: Literal["parquet", "orc", "csv", "json"] = default_source_file_type + validate_level: Literal["no", "weak", "strong"] = default_validate_level version: Optional[str] = default_version @field_validator("path") @@ -87,11 +89,25 @@ def check_delimiter(cls, v): raise ValueError(msg) return v + @model_validator(mode="after") + def check_file_type(self) -> Self: + if not self.file_type: + file_type = Path(self.path).suffix.removeprefix(".") + if file_type == "": + msg = f"File {self.path} has no file type suffix" + raise ValueError(msg) + if file_type not in support_file_types: + msg = f"File type {file_type} not supported" + raise ValueError(msg) + self.file_type = file_type + return self + class Vertex(BaseModel): type: str labels: List[str] = [] chunk_size: Optional[int] = None + validate_level: Optional[Literal["no", "weak", "strong"]] = None prefix: Optional[str] = None property_groups: List[PropertyGroup] sources: List[Source] @@ -133,6 +149,7 @@ class Edge(BaseModel): dst_prop: Optional[str] = None labels: List[str] = [] chunk_size: Optional[int] = None + validate_level: Optional[Literal["no", "weak", "strong"]] = None adj_lists: List[AdjList] = [] property_groups: List[PropertyGroup] = [] sources: List[Source] @@ -176,49 +193,43 @@ class ImportConfig(BaseModel): graphar: GraphArConfig import_schema: ImportSchema - @field_validator("import_schema") - def replace_none_values(cls, v: ImportSchema, values: Dict[str, Any]): - config = values.data["graphar"] - for vertex in v.vertices: + @model_validator(mode="after") + def check_none_types(self) -> Self: + for vertex in self.import_schema.vertices: if vertex.chunk_size is None: - vertex.chunk_size = config.vertex_chunk_size + vertex.chunk_size = self.graphar.vertex_chunk_size + if vertex.validate_level is None: + vertex.validate_level = self.graphar.validate_level for 
property_group in vertex.property_groups: if property_group.file_type is None: - property_group.file_type = config.file_type - for source in vertex.sources: - if source.file_type is None: - source.file_type = config.source_file_type - for edge in v.edges: + property_group.file_type = self.graphar.file_type + for edge in self.import_schema.edges: if edge.chunk_size is None: - edge.chunk_size = config.edge_chunk_size + edge.chunk_size = self.graphar.edge_chunk_size + if edge.validate_level is None: + edge.validate_level = self.graphar.validate_level if len(edge.adj_lists) == 0: - if config.adj_list_type == "ordered_by_source": + if self.graphar.adj_list_type == "ordered_by_source": edge.adj_lists.append( - AdjList(ordered=True, aligned_by="src", file_type=config.file_type) + AdjList(ordered=True, aligned_by="src", file_type=self.graphar.file_type) ) - elif config.adj_list_type == "ordered_by_dest": + elif self.graphar.adj_list_type == "ordered_by_dest": edge.adj_lists.append( - AdjList(ordered=True, aligned_by="dst", file_type=config.file_type) + AdjList(ordered=True, aligned_by="dst", file_type=self.graphar.file_type) ) - elif config.adj_list_type == "unordered_by_source": + elif self.graphar.adj_list_type == "unordered_by_source": edge.adj_lists.append( - AdjList(ordered=False, aligned_by="src", file_type=config.file_type) + AdjList(ordered=False, aligned_by="src", file_type=self.graphar.file_type) ) - elif config.adj_list_type == "unordered_by_dest": + elif self.graphar.adj_list_type == "unordered_by_dest": edge.adj_lists.append( - AdjList(ordered=False, aligned_by="dst", file_type=config.file_type) + AdjList(ordered=False, aligned_by="dst", file_type=self.graphar.file_type) ) else: - msg = "Invalid adj_list_type" + msg = f"Invalid adj_list_type '{self.graphar.adj_list_type}'" raise ValueError(msg) - else: - for adj_list in edge.adj_lists: - if adj_list.file_type is None: - adj_list.file_type = config.file_type for property_group in edge.property_groups: if property_group.file_type is None: - property_group.file_type = config.file_type - for source in edge.sources: - if source.file_type is None: - source.file_type = config.source_file_type - return v + property_group.file_type = self.graphar.file_type + + return self diff --git a/cli/src/graphar_cli/graphar_cli.py b/cli/src/graphar_cli/graphar_cli.py index 433df3424..b9ec4f24f 100644 --- a/cli/src/graphar_cli/graphar_cli.py +++ b/cli/src/graphar_cli/graphar_cli.py @@ -41,7 +41,7 @@ no_args_is_help=True, ) def show( - path: str = typer.Option(None, "--path", "-p", help="Path to the GraphAr config file"), + path: str = typer.Option(None, "--path", "-f", help="Path to the GraphAr config file"), vertex: str = typer.Option(None, "--vertex", "-v", help="Vertex type to show"), edge_src: str = typer.Option(None, "--edge-src", "-es", help="Source of the edge type to show"), edge: str = typer.Option(None, "--edge", "-e", help="Edge type to show"), @@ -92,7 +92,7 @@ def show( no_args_is_help=True, ) def check( - path: str = typer.Option(None, "--path", "-p", help="Path to the GraphAr config file"), + path: str = typer.Option(None, "--path", "-f", help="Path to the GraphAr config file"), ): if not Path(path).exists(): logger.error("File not found: %s", path) @@ -167,31 +167,5 @@ def import_data( raise typer.Exit(1) from None -@app.command( - "merge", context_settings={"help_option_names": ["-h", "--help"]}, help="Merge source files" -) -def merge_data( - files: List[str] = typer.Option(None, "--file", "-f", help="Files to merge"), # noqa: B008 - 
type: str = typer.Option(None, "--type", "-t", help="Type of data to merge"), - output: str = typer.Option(None, "--output", "-o", help="Output file"), -): - if type == "parquet": - data = pd.concat([pd.read_parquet(file) for file in files], ignore_index=True) - data.to_parquet(output) - elif type == "csv": - data = pd.concat([pd.read_csv(file) for file in files], ignore_index=True) - data.to_csv(output) - elif type == "orc": - data = pd.concat([pd.read_orc(file) for file in files], ignore_index=True) - data.to_orc(output) - elif type == "json": - data = pd.concat([pd.read_json(file) for file in files], ignore_index=True) - data.to_json(output) - else: - logger.error("Type %s not supported", type) - raise typer.Exit(1) - logger.info("Merged data saved to %s", output) - - def main() -> None: app() diff --git a/cli/src/graphar_cli/importer.py b/cli/src/graphar_cli/importer.py index bd7b621d3..49d577193 100644 --- a/cli/src/graphar_cli/importer.py +++ b/cli/src/graphar_cli/importer.py @@ -18,27 +18,29 @@ def validate(import_config: ImportConfig): for prop_group in vertex.property_groups: for prop in prop_group.properties: if prop.name in prop_names: - msg = f"Duplicate property `{prop.name}` in vertex `{vertex.type}`" + msg = f"Duplicate property '{prop.name}' in vertex '{vertex.type}'" raise ValueError(msg) prop_names.add(prop.name) if prop.is_primary: if len(primary_keys): msg = ( - f"Multiple primary keys `{primary_keys[0]}` and `{prop.name}` " - f"found in vertex `{vertex.type}`" + f"Multiple primary keys '{primary_keys[0]}' and '{prop.name}' " + f"found in vertex '{vertex.type}'" ) raise ValueError(msg) primary_keys.append(prop.name) if prop.nullable: - msg = f"Primary key `{prop.name}` in `{vertex.type}` cannot be nullable" + msg = f"Primary key '{prop.name}' in '{vertex.type}' cannot be nullable" raise ValueError(msg) - source_keys = [key for source in vertex.sources for key in source.columns] + source_values = [value for source in vertex.sources for value in source.columns.values()] for prop_name in prop_names: - if prop_name not in source_keys: + if prop_name not in source_values: msg = ( - f"Property `{prop_name}` in vertex `{vertex.type}` not found in source columns" + f"Property '{prop_name}' in vertex '{vertex.type}' not found in source columns" ) raise ValueError(msg) + msg = f"Source columns are more than the properties in vertex '{vertex.type}'" + assert len(source_values) == len(prop_names), msg logger.debug("Validated vertex %s", vertex.type) edge_types = set() @@ -63,9 +65,9 @@ def validate(import_config: ImportConfig): prop.name for prop_group in src_vertex.property_groups for prop in prop_group.properties ]: msg = ( - f"Source property `{edge.src_prop}` " - f"not found in source vertex `{edge.src_type}` " - f"in edge `{edge.edge_type}`" + f"Source property '{edge.src_prop}' " + f"not found in source vertex '{edge.src_type}' " + f"in edge '{edge.edge_type}'" ) raise ValueError(msg) dst_vertex = next( @@ -77,25 +79,27 @@ def validate(import_config: ImportConfig): prop.name for prop_group in dst_vertex.property_groups for prop in prop_group.properties ]: msg = ( - f"Destination property `{edge.dst_prop}` " - f"not found in destination vertex `{edge.dst_type}` " - f"in edge `{edge.edge_type}`" + f"Destination property '{edge.dst_prop}' " + f"not found in destination vertex '{edge.dst_type}' " + f"in edge '{edge.edge_type}'" ) raise ValueError(msg) prop_names = set() for prop_group in edge.property_groups: for prop in prop_group.properties: if prop.name in prop_names: - msg = 
f"Duplicate property `{prop.name}` in edge `{edge.edge_type}`" + msg = f"Duplicate property '{prop.name}' in edge '{edge.edge_type}'" raise ValueError(msg) prop_names.add(prop.name) - source_keys = [key for source in edge.sources for key in source.columns] + source_values = [value for source in edge.sources for value in source.columns.values()] for prop_name in prop_names: - if prop_name not in source_keys: + if prop_name not in source_values: msg = ( - f"Property `{prop_name}` in edge `{edge.edge_type}` " + f"Property '{prop_name}' in edge " + f"'{edge.dst_prop}_{edge.edge_type}_{edge.edge_type}' " f"not found in source columns" ) raise ValueError(msg) + # TODO: Validate source columns logger.debug("Validated edge %s %s %s", edge.src_type, edge.edge_type, edge.dst_type) diff --git a/cli/src/importer.h b/cli/src/importer.h index d1672ee99..4e70826c4 100644 --- a/cli/src/importer.h +++ b/cli/src/importer.h @@ -1,9 +1,18 @@ +#pragma once + #include #include #include +#include +#include #include +#include +#include +#include + +#include "util.h" namespace py = pybind11; namespace fs = std::filesystem; @@ -30,13 +39,14 @@ struct Source { std::string file_type; std::string path; char delimiter; - std::map columns; + std::unordered_map columns; }; struct Vertex { std::string type; std::vector labels; int chunk_size; + std::string validate_level; std::string prefix; std::vector property_groups; std::vector sources; @@ -56,6 +66,7 @@ struct Edge { std::string dst_prop; std::vector labels; int chunk_size; + std::string validate_level; std::string prefix; std::vector adj_lists; std::vector property_groups; @@ -89,6 +100,7 @@ ImportConfig ConvertPyDictToConfig(const py::dict& config_dict) { vertex.type = vertex_dict["type"].cast(); vertex.chunk_size = vertex_dict["chunk_size"].cast(); vertex.prefix = vertex_dict["prefix"].cast(); + vertex.validate_level = vertex_dict["validate_level"].cast(); vertex.labels = vertex_dict["labels"].cast>(); auto pg_list = vertex_dict["property_groups"].cast>(); @@ -103,9 +115,9 @@ ImportConfig ConvertPyDictToConfig(const py::dict& config_dict) { prop.data_type = prop_dict["data_type"].cast(); prop.is_primary = prop_dict["is_primary"].cast(); prop.nullable = prop_dict["nullable"].cast(); - pg.properties.push_back(prop); + pg.properties.emplace_back(prop); } - vertex.property_groups.push_back(pg); + vertex.property_groups.emplace_back(pg); } auto source_list = vertex_dict["sources"].cast>(); @@ -114,13 +126,13 @@ ImportConfig ConvertPyDictToConfig(const py::dict& config_dict) { src.file_type = source_dict["file_type"].cast(); src.path = source_dict["path"].cast(); src.delimiter = source_dict["delimiter"].cast(); - src.columns = - source_dict["columns"].cast>(); + src.columns = source_dict["columns"] + .cast>(); - vertex.sources.push_back(src); + vertex.sources.emplace_back(src); } - import_config.import_schema.vertices.push_back(vertex); + import_config.import_schema.vertices.emplace_back(vertex); } auto edges_list = schema_dict["edges"].cast>(); @@ -133,6 +145,7 @@ ImportConfig ConvertPyDictToConfig(const py::dict& config_dict) { edge.dst_prop = edge_dict["dst_prop"].cast(); edge.labels = edge_dict["labels"].cast>(); edge.chunk_size = edge_dict["chunk_size"].cast(); + edge.validate_level = edge_dict["validate_level"].cast(); edge.prefix = edge_dict["prefix"].cast(); auto adj_list_dicts = edge_dict["adj_lists"].cast>(); @@ -141,7 +154,7 @@ ImportConfig ConvertPyDictToConfig(const py::dict& config_dict) { adj_list.ordered = adj_list_dict["ordered"].cast(); 
adj_list.aligned_by = adj_list_dict["aligned_by"].cast(); adj_list.file_type = adj_list_dict["file_type"].cast(); - edge.adj_lists.push_back(adj_list); + edge.adj_lists.emplace_back(adj_list); } auto edge_pg_list = @@ -158,10 +171,10 @@ ImportConfig ConvertPyDictToConfig(const py::dict& config_dict) { edge_prop.data_type = prop_dict["data_type"].cast(); edge_prop.is_primary = prop_dict["is_primary"].cast(); edge_prop.nullable = prop_dict["nullable"].cast(); - edge_pg.properties.push_back(edge_prop); + edge_pg.properties.emplace_back(edge_prop); } - edge.property_groups.push_back(edge_pg); + edge.property_groups.emplace_back(edge_pg); } auto edge_source_list = edge_dict["sources"].cast>(); @@ -170,13 +183,14 @@ ImportConfig ConvertPyDictToConfig(const py::dict& config_dict) { edge_src.file_type = edge_source_dict["file_type"].cast(); edge_src.path = edge_source_dict["path"].cast(); edge_src.delimiter = edge_source_dict["delimiter"].cast(); - edge_src.columns = edge_source_dict["columns"] - .cast>(); + edge_src.columns = + edge_source_dict["columns"] + .cast>(); - edge.sources.push_back(edge_src); + edge.sources.emplace_back(edge_src); } - import_config.import_schema.edges.push_back(edge); + import_config.import_schema.edges.emplace_back(edge); } return import_config; @@ -189,29 +203,286 @@ std::string DoImport(const py::dict& config_dict) { graphar::InfoVersion::Parse(import_config.graphar_config.version).value(); fs::path save_path = import_config.graphar_config.path; + std::unordered_map vertex_chunk_sizes; + std::unordered_map vertex_counts; + + std::map, + std::unordered_map, graphar::IdType, + arrow::Scalar::Hash, arrow::Scalar::PtrsEqual>> + vertex_prop_index_map; + + std::unordered_map> + vertex_props_in_edges; + std::map, graphar::Property> + vertex_prop_property_map; + for (const auto& edge : import_config.import_schema.edges) { + vertex_props_in_edges[edge.src_type].emplace_back(edge.src_prop); + vertex_props_in_edges[edge.dst_type].emplace_back(edge.dst_prop); + } for (const auto& vertex : import_config.import_schema.vertices) { - auto pgs = std::vector>(); + vertex_chunk_sizes[vertex.type] = vertex.chunk_size; + auto pgs = std::vector>(); + std::string primary_key; for (const auto& pg : vertex.property_groups) { std::vector props; for (const auto& prop : pg.properties) { - props.push_back(graphar::Property( + if (prop.is_primary) { + if (!primary_key.empty()) { + throw std::runtime_error("Multiple primary keys found in vertex " + + vertex.type); + } + primary_key = prop.name; + } + graphar::Property property( prop.name, graphar::DataType::TypeNameToDataType(prop.data_type), - prop.is_primary, prop.nullable)); + prop.is_primary, prop.nullable); + props.emplace_back(property); + vertex_prop_property_map[std::make_pair(vertex.type, prop.name)] = + property; } // TODO: add prefix parameter in config auto property_group = graphar::CreatePropertyGroup( props, graphar::StringToFileType(pg.file_type)); - pgs.push_back(property_group); + pgs.emplace_back(property_group); } auto vertex_info = graphar::CreateVertexInfo(vertex.type, vertex.chunk_size, pgs, vertex.prefix, version); auto file_name = vertex.type + ".vertex.yml"; vertex_info->Save(save_path / file_name); + auto save_path_str = save_path.string(); + save_path_str += "/"; + auto vertex_prop_writer = graphar::VertexPropertyWriter::Make( + vertex_info, save_path_str, + StringToValidateLevel(vertex.validate_level)) + .value(); + + std::vector> vertex_tables; + for (const auto& source : vertex.sources) { + std::vector column_names; + 
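      // The keys of `columns` are the column names as they appear in the source file; only those columns are read from disk.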
for (const auto& [key, value] : source.columns) { + column_names.emplace_back(key); + } + auto table = GetDataFromFile(source.path, column_names, source.delimiter, + source.file_type); + + std::unordered_map column_prop_map; + std::unordered_map reversed_columns_config; + for (const auto& [key, value] : source.columns) { + reversed_columns_config[value] = key; + } + for (const auto& pg : vertex.property_groups) { + for (const auto& prop : pg.properties) { + column_prop_map[reversed_columns_config[prop.name]] = prop; + } + } + std::unordered_map< + std::string, std::pair>> + columns_to_change; + for (const auto& [column, prop] : column_prop_map) { + auto arrow_data_type = graphar::DataType::DataTypeToArrowDataType( + graphar::DataType::TypeNameToDataType(prop.data_type)); + auto arrow_column = table->GetColumnByName(column); + // TODO: whether need to check duplicate values for primary key? + if (!prop.nullable) { + for (const auto& chunk : arrow_column->chunks()) { + if (chunk->null_count() > 0) { + throw std::runtime_error("Non-nullable column '" + column + + "' has null values"); + } + } + } + // TODO: check this + if (column != prop.name || + arrow_column->type()->id() != arrow_data_type->id()) { + columns_to_change[column] = + std::make_pair(prop.name, arrow_data_type); + } + } + table = ChangeNameAndDataType(table, columns_to_change); + vertex_tables.emplace_back(table); + } + std::shared_ptr merged_vertex_table = + MergeTables(vertex_tables); + // TODO: check all fields in props // TODO: add start_index in config - graphar::IdType start_index = 0; + graphar::IdType start_chunk_index = 0; + + auto vertex_table_with_index = + vertex_prop_writer + ->addIndexColumn(merged_vertex_table, start_chunk_index, + vertex_info->GetChunkSize()) + .value(); + for (const auto& property_group : pgs) { + vertex_prop_writer->WriteTable(vertex_table_with_index, property_group, + start_chunk_index); + } + if (vertex_props_in_edges.find(vertex.type) != + vertex_props_in_edges.end()) { + for (const auto& vertex_prop : vertex_props_in_edges[vertex.type]) { + vertex_prop_index_map[std::make_pair(vertex.type, vertex_prop)] = + TableToUnorderedMap(vertex_table_with_index, vertex_prop, + graphar::GeneralParams::kVertexIndexCol); + } + } + auto vertex_count = merged_vertex_table->num_rows(); + vertex_counts[vertex.type] = vertex_count; + vertex_prop_writer->WriteVerticesNum(vertex_count); } - return "Importing..."; + for (const auto& edge : import_config.import_schema.edges) { + auto pgs = std::vector>(); + + for (const auto& pg : edge.property_groups) { + std::vector props; + for (const auto& prop : pg.properties) { + props.emplace_back(graphar::Property( + prop.name, graphar::DataType::TypeNameToDataType(prop.data_type), + prop.is_primary, prop.nullable)); + } + // TODO: add prefix parameter in config + auto property_group = graphar::CreatePropertyGroup( + props, graphar::StringToFileType(pg.file_type)); + pgs.emplace_back(property_group); + } + graphar::AdjacentListVector adj_lists; + for (const auto& adj_list : edge.adj_lists) { + // TODO: add prefix parameter in config + adj_lists.emplace_back(graphar::CreateAdjacentList( + graphar::OrderedAlignedToAdjListType(adj_list.ordered, + adj_list.aligned_by), + graphar::StringToFileType(adj_list.file_type))); + } + + // TODO: add directed parameter in config + + bool directed = true; + // TODO: whether prefix has default value? 
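+    // Assemble the edge metadata; the src/dst vertex chunk sizes were recorded while importing the vertices above.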
+ + auto edge_info = graphar::CreateEdgeInfo( + edge.src_type, edge.edge_type, edge.dst_type, edge.chunk_size, + vertex_chunk_sizes[edge.src_type], vertex_chunk_sizes[edge.dst_type], + directed, adj_lists, pgs, edge.prefix, version); + auto file_name = + ConcatEdgeTriple(edge.src_type, edge.edge_type, edge.dst_type) + + ".edge.yml"; + edge_info->Save(save_path / file_name); + auto save_path_str = save_path.string(); + save_path_str += "/"; + for (const auto& adj_list : adj_lists) { + int64_t vertex_count; + if (adj_list->GetType() == graphar::AdjListType::ordered_by_source || + adj_list->GetType() == graphar::AdjListType::unordered_by_source) { + vertex_count = vertex_counts[edge.src_type]; + } else { + vertex_count = vertex_counts[edge.dst_type]; + } + std::vector> edge_tables; + + for (const auto& source : edge.sources) { + std::vector column_names; + for (const auto& [key, value] : source.columns) { + column_names.emplace_back(key); + } + auto table = GetDataFromFile(source.path, column_names, + source.delimiter, source.file_type); + std::unordered_map column_prop_map; + std::unordered_map reversed_columns; + for (const auto& [key, value] : source.columns) { + reversed_columns[value] = key; + } + + for (const auto& pg : edge.property_groups) { + for (const auto& prop : pg.properties) { + column_prop_map[reversed_columns[prop.name]] = graphar::Property( + prop.name, + graphar::DataType::TypeNameToDataType(prop.data_type), + prop.is_primary, prop.nullable); + } + } + column_prop_map[reversed_columns.at(edge.src_prop)] = + vertex_prop_property_map.at( + std::make_pair(edge.src_type, edge.src_prop)); + column_prop_map[reversed_columns.at(edge.dst_prop)] = + vertex_prop_property_map.at( + std::make_pair(edge.dst_type, edge.dst_prop)); + std::unordered_map< + std::string, + std::pair>> + columns_to_change; + for (const auto& [column, prop] : column_prop_map) { + auto arrow_data_type = + graphar::DataType::DataTypeToArrowDataType(prop.type); + auto arrow_column = table->GetColumnByName(column); + // TODO: is needed? 
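+          // A column mapped to a non-nullable property must not contain nulls; every chunk is scanned before the edges are written.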
+ if (!prop.is_nullable) { + for (const auto& chunk : arrow_column->chunks()) { + if (chunk->null_count() > 0) { + throw std::runtime_error("Non-nullable column '" + column + + "' has null values"); + } + } + } + if (column != prop.name || + arrow_column->type()->id() != arrow_data_type->id()) { + columns_to_change[column] = + std::make_pair(prop.name, arrow_data_type); + } + } + table = ChangeNameAndDataType(table, columns_to_change); + edge_tables.emplace_back(table); + } + std::unordered_map< + std::string, std::pair>> + vertex_columns_to_change; + + std::shared_ptr merged_edge_table = + MergeTables(edge_tables); + // TODO: check all fields in props + + auto combined_edge_table = + merged_edge_table->CombineChunks().ValueOrDie(); + + auto edge_builder = + graphar::builder::EdgesBuilder::Make( + edge_info, save_path_str, adj_list->GetType(), vertex_count, + StringToValidateLevel(edge.validate_level)) + .value(); + + std::vector edge_column_names; + for (const auto& field : combined_edge_table->schema()->fields()) { + edge_column_names.push_back(field->name()); + } + const int64_t num_rows = combined_edge_table->num_rows(); + for (int64_t i = 0; i < num_rows; ++i) { + auto edge_src_column = + combined_edge_table->GetColumnByName(edge.src_prop); + auto edge_dst_column = + combined_edge_table->GetColumnByName(edge.dst_prop); + + graphar::builder::Edge e( + vertex_prop_index_map + .at(std::make_pair(edge.src_type, edge.src_prop)) + .at(edge_src_column->GetScalar(i).ValueOrDie()), + vertex_prop_index_map + .at(std::make_pair(edge.dst_type, edge.dst_prop)) + .at(edge_dst_column->GetScalar(i).ValueOrDie())); + for (const auto& column_name : edge_column_names) { + if (column_name != edge.src_prop && column_name != edge.dst_prop) { + auto column = combined_edge_table->GetColumnByName(column_name); + auto column_type = column->type(); + std::any value; + TryToCastToAny( + graphar::DataType::ArrowDataTypeToDataType(column_type), + column->chunk(0), value); + e.AddProperty(column_name, value); + } + } + edge_builder->AddEdge(e); + } + edge_builder->Dump(); + } + } + return "Imported successfully!"; } \ No newline at end of file diff --git a/cli/src/util.h b/cli/src/util.h new file mode 100644 index 000000000..d67d78d80 --- /dev/null +++ b/cli/src/util.h @@ -0,0 +1,464 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +std::string ConcatEdgeTriple(const std::string& src_type, + const std::string& edge_type, + const std::string& dst_type) { + return src_type + REGULAR_SEPARATOR + edge_type + REGULAR_SEPARATOR + + dst_type; +} + +graphar::ValidateLevel StringToValidateLevel(const std::string& level) { + if (level == "no") { + return graphar::ValidateLevel::no_validate; + } else if (level == "weak") { + return graphar::ValidateLevel::weak_validate; + } else if (level == "strong") { + return graphar::ValidateLevel::strong_validate; + } else { + throw std::runtime_error("Invalid validate level: " + level); + } +} + +// Utility function to filter the columns from a table +std::shared_ptr SelectColumns( + const std::shared_ptr& table, + const std::vector& column_names) { + if (column_names.empty()) { + throw std::runtime_error("No column names provided."); + } + std::vector indices; + for (const auto& name : column_names) { + auto column_index = table->schema()->GetFieldIndex(name); + if (column_index != -1) { + indices.push_back(column_index); + } + } + + if (indices.empty()) { + throw std::runtime_error("None of the column 
names matched the schema."); + } + + return table->SelectColumns(indices).ValueOrDie(); +} + +std::shared_ptr GetDataFromParquetFile( + const std::string& path, const std::vector& column_names) { + // Open the Parquet file + auto infile = + arrow::io::ReadableFile::Open(path, arrow::default_memory_pool()) + .ValueOrDie(); + + // Create a Parquet FileReader + std::unique_ptr parquet_reader; + auto status = parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), + &parquet_reader); + if (!status.ok()) { + throw std::runtime_error("Failed to create Parquet FileReader: " + + status.ToString()); + } + + // Retrieve the Arrow schema from the Parquet file + std::shared_ptr schema; + status = parquet_reader->GetSchema(&schema); + if (!status.ok()) { + throw std::runtime_error("Failed to retrieve schema from Parquet file: " + + status.ToString()); + } + + // Map column names to their indices in the schema + std::vector column_indices; + for (const auto& col_name : column_names) { + int64_t index = schema->GetFieldIndex(col_name); + if (index == -1) { + throw std::runtime_error("Column not found in schema: " + col_name); + } + column_indices.push_back(index); + } + + // Read the table with the selected columns + std::shared_ptr table; + status = parquet_reader->ReadTable(column_indices, &table); + if (!status.ok()) { + throw std::runtime_error("Failed to read table from Parquet file: " + + status.ToString()); + } + + return table; +} + +std::shared_ptr GetDataFromCsvFile( + const std::string& path, const std::vector& column_names, + const char delimiter) { + // Open the CSV file + auto input_result = + arrow::io::ReadableFile::Open(path, arrow::default_memory_pool()); + if (!input_result.ok()) { + throw std::runtime_error("Failed to open CSV file: " + + input_result.status().ToString()); + } + std::shared_ptr input = input_result.ValueOrDie(); + + // Define CSV parse options with the specified delimiter + arrow::csv::ParseOptions parse_options = arrow::csv::ParseOptions::Defaults(); + parse_options.delimiter = delimiter; + + // Define CSV convert options to include only the specified columns + arrow::csv::ConvertOptions convert_options = + arrow::csv::ConvertOptions::Defaults(); + convert_options.include_columns = column_names; + + // Optional: Define CSV read options (using defaults here) + arrow::csv::ReadOptions read_options = arrow::csv::ReadOptions::Defaults(); + + // Create a CSV TableReader using IOContext + arrow::io::IOContext io_context(arrow::default_memory_pool()); + arrow::Result> reader_result = + arrow::csv::TableReader::Make(io_context, input, read_options, + parse_options, convert_options); + + if (!reader_result.ok()) { + throw std::runtime_error("Failed to create CSV TableReader: " + + reader_result.status().ToString()); + } + std::shared_ptr reader = reader_result.ValueOrDie(); + + // Read the table + arrow::Result> table_result = reader->Read(); + if (!table_result.ok()) { + throw std::runtime_error("Failed to read table from CSV file: " + + table_result.status().ToString()); + } + std::shared_ptr table = table_result.ValueOrDie(); + + // Optional: Validate that all requested columns are present + auto schema = table->schema(); + for (const auto& col_name : column_names) { + if (schema->GetFieldByName(col_name) == nullptr) { + throw std::runtime_error("Column not found in CSV file: " + col_name); + } + } + + return table; +} + +std::shared_ptr GetDataFromOrcFile( + const std::string& path, const std::vector& column_names) { + // Open the ORC file + auto infile = + 
arrow::io::ReadableFile::Open(path, arrow::default_memory_pool())
+          .ValueOrDie();
+
+  // Create an ORC file reader
+  std::unique_ptr<arrow::adapters::orc::ORCFileReader> orc_reader =
+      arrow::adapters::orc::ORCFileReader::Open(infile,
+                                                arrow::default_memory_pool())
+          .ValueOrDie();
+
+  // Read the table with the selected columns
+  arrow::Result<std::shared_ptr<arrow::Table>> table_result =
+      orc_reader->Read(column_names);
+  if (!table_result.ok()) {
+    throw std::runtime_error("Failed to read table from ORC file: " +
+                             table_result.status().ToString());
+  }
+  std::shared_ptr<arrow::Table> table = table_result.ValueOrDie();
+
+  // Optional: Validate that all requested columns are present
+  auto schema = table->schema();
+  for (const auto& col_name : column_names) {
+    if (schema->GetFieldByName(col_name) == nullptr) {
+      throw std::runtime_error("Column not found in ORC file: " + col_name);
+    }
+  }
+
+  return table;
+}
+
+std::shared_ptr<arrow::Table> GetDataFromJsonFile(
+    const std::string& path, const std::vector<std::string>& column_names) {
+  // Open the JSON file
+  auto infile =
+      arrow::io::ReadableFile::Open(path, arrow::default_memory_pool())
+          .ValueOrDie();
+
+  // Define JSON read options (using defaults here)
+  arrow::json::ReadOptions read_options = arrow::json::ReadOptions::Defaults();
+
+  // Define JSON parse options (using defaults here)
+  arrow::json::ParseOptions parse_options =
+      arrow::json::ParseOptions::Defaults();
+
+  // Create a JSON TableReader with the options defined above
+  std::shared_ptr<arrow::json::TableReader> json_reader =
+      arrow::json::TableReader::Make(arrow::default_memory_pool(), infile,
+                                     read_options, parse_options)
+          .ValueOrDie();
+
+  // Read the table
+  arrow::Result<std::shared_ptr<arrow::Table>> table_result =
+      json_reader->Read();
+  if (!table_result.ok()) {
+    throw std::runtime_error("Failed to read table from JSON file: " +
+                             table_result.status().ToString());
+  }
+  std::shared_ptr<arrow::Table> table = table_result.ValueOrDie();
+
+  table = SelectColumns(table, column_names);
+
+  // Optional: Validate that all requested columns are present
+  // TODO: must be equal
+  auto schema = table->schema();
+  for (const auto& col_name : column_names) {
+    if (schema->GetFieldByName(col_name) == nullptr) {
+      throw std::runtime_error("Column not found in JSON file: " + col_name);
+    }
+  }
+
+  return table;
+}
+
+std::shared_ptr<arrow::Table> GetDataFromFile(
+    const std::string& path, const std::vector<std::string>& column_names,
+    const char& delimiter, const std::string& file_type) {
+  // TODO: use explicit schema
+  if (file_type == "parquet") {
+    return GetDataFromParquetFile(path, column_names);
+  } else if (file_type == "csv") {
+    return GetDataFromCsvFile(path, column_names, delimiter);
+  } else if (file_type == "orc") {
+    return GetDataFromOrcFile(path, column_names);
+  } else if (file_type == "json") {
+    return GetDataFromJsonFile(path, column_names);
+  } else {
+    throw std::runtime_error("Unsupported file type: " + file_type);
+  }
+}
+
+std::shared_ptr<arrow::Table> ChangeNameAndDataType(
+    const std::shared_ptr<arrow::Table>& table,
+    const std::unordered_map<
+        std::string, std::pair<std::string, std::shared_ptr<arrow::DataType>>>&
+        columns_to_change) {
+  // Retrieve original schema and number of columns
+  auto original_schema = table->schema();
+  int64_t num_columns = table->num_columns();
+
+  // Prepare vectors for new schema fields and new column data
+  std::vector<std::shared_ptr<arrow::Field>> new_fields;
+  std::vector<std::shared_ptr<arrow::ChunkedArray>> new_columns;
+
+  for (int64_t i = 0; i < num_columns; ++i) {
+    auto original_field = original_schema->field(i);
+    auto original_column = table->column(i);  // This is a ChunkedArray
+
+    std::string original_name = original_field->name();
+    std::shared_ptr<arrow::DataType> original_type = original_field->type();
+
+    // Check
+
+std::shared_ptr<arrow::Table> ChangeNameAndDataType(
+    const std::shared_ptr<arrow::Table>& table,
+    const std::unordered_map<
+        std::string, std::pair<std::string, std::shared_ptr<arrow::DataType>>>&
+        columns_to_change) {
+  // Retrieve original schema and number of columns
+  auto original_schema = table->schema();
+  int64_t num_columns = table->num_columns();
+
+  // Prepare vectors for new schema fields and new column data
+  std::vector<std::shared_ptr<arrow::Field>> new_fields;
+  std::vector<std::shared_ptr<arrow::ChunkedArray>> new_columns;
+
+  for (int64_t i = 0; i < num_columns; ++i) {
+    auto original_field = original_schema->field(i);
+    auto original_column = table->column(i);  // This is a ChunkedArray
+
+    std::string original_name = original_field->name();
+    std::shared_ptr<arrow::DataType> original_type = original_field->type();
+
+    // Check if this column needs to be changed
+    auto it = columns_to_change.find(original_name);
+    if (it != columns_to_change.end()) {
+      std::string new_name = it->second.first;
+      std::shared_ptr<arrow::DataType> new_type = it->second.second;
+
+      bool name_changed = (new_name != original_name);
+      bool type_changed = !original_type->Equals(*new_type);
+
+      std::shared_ptr<arrow::ChunkedArray> new_chunked_array;
+
+      // If data type needs to be changed, cast each chunk
+      if (type_changed) {
+        std::vector<std::shared_ptr<arrow::Array>> casted_chunks;
+        for (const auto& chunk : original_column->chunks()) {
+          // Perform type casting using Compute API
+          arrow::compute::CastOptions cast_options;
+          cast_options.allow_int_overflow = false;  // Set as needed
+
+          auto cast_result =
+              arrow::compute::Cast(*chunk, new_type, cast_options);
+          if (!cast_result.ok()) {
+            throw std::runtime_error("Failed to cast column data.");
+          }
+          casted_chunks.push_back(cast_result.ValueOrDie());
+        }
+        // Create a new ChunkedArray with casted chunks
+        new_chunked_array =
+            std::make_shared<arrow::ChunkedArray>(casted_chunks, new_type);
+      } else {
+        // If type is not changed, keep the original column
+        new_chunked_array = original_column;
+      }
+
+      // Create a new Field with the updated name and type
+      auto new_field =
+          arrow::field(new_name, type_changed ? new_type : original_type,
+                       original_field->nullable());
+      new_fields.push_back(new_field);
+      new_columns.push_back(new_chunked_array);
+    } else {
+      // Columns not in the change map remain unchanged
+      new_fields.push_back(original_field);
+      new_columns.push_back(original_column);
+    }
+  }
+
+  // Create the new schema
+  auto new_schema = arrow::schema(new_fields);
+
+  // Construct the new table with updated schema and columns
+  auto new_table = arrow::Table::Make(new_schema, new_columns);
+
+  return new_table;
+}
+
+std::shared_ptr<arrow::Table> MergeTables(
+    const std::vector<std::shared_ptr<arrow::Table>>& tables) {
+  // Check if tables vector is not empty
+  if (tables.empty()) {
+    throw std::runtime_error("No tables to merge.");
+  }
+
+  // Check if all tables have the same number of rows
+  int64_t num_rows = tables[0]->num_rows();
+  for (const auto& table : tables) {
+    if (table->num_rows() != num_rows) {
+      throw std::runtime_error("All tables must have the same number of rows.");
+    }
+  }
+
+  // Prepare vectors to hold all the fields and columns from the input tables
+  std::vector<std::shared_ptr<arrow::Field>> fields;
+  std::vector<std::shared_ptr<arrow::ChunkedArray>> columns;
+
+  for (const auto& table : tables) {
+    for (int64_t i = 0; i < table->num_columns(); ++i) {
+      fields.push_back(table->schema()->field(i));
+      columns.push_back(table->column(i));
+    }
+  }
+
+  // Create a new schema and table with merged columns
+  auto merged_schema = std::make_shared<arrow::Schema>(fields);
+  auto merged_table = arrow::Table::Make(merged_schema, columns, num_rows);
+
+  return merged_table;
+}
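+
+// Example (sketch): renaming a source column "born" to "birth_year" and
+// widening it to int64 before merging. The column names are hypothetical.
+//
+//   std::unordered_map<
+//       std::string, std::pair<std::string, std::shared_ptr<arrow::DataType>>>
+//       changes = {{"born", {"birth_year", arrow::int64()}}};
+//   auto adjusted = ChangeNameAndDataType(table, changes);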
+
+std::unordered_map<std::shared_ptr<arrow::Scalar>, graphar::IdType,
+                   arrow::Scalar::Hash, arrow::Scalar::PtrsEqual>
+TableToUnorderedMap(const std::shared_ptr<arrow::Table>& table,
+                    const std::string& key_column_name,
+                    const std::string& value_column_name) {
+  auto combined_table = table->CombineChunks().ValueOrDie();
+  // Get the column indices
+  auto key_column_idx =
+      combined_table->schema()->GetFieldIndex(key_column_name);
+  auto value_column_idx =
+      combined_table->schema()->GetFieldIndex(value_column_name);
+  if (key_column_idx == -1) {
+    throw std::runtime_error("Key column '" + key_column_name +
+                             "' not found in the table.");
+  }
+  if (value_column_idx == -1) {
+    throw std::runtime_error("Value column '" + value_column_name +
+                             "' not found in the table.");
+  }
+
+  // Extract the columns
+  auto key_column = combined_table->column(key_column_idx);
+  auto value_column = combined_table->column(value_column_idx);
+
+  std::unordered_map<std::shared_ptr<arrow::Scalar>, graphar::IdType,
+                     arrow::Scalar::Hash, arrow::Scalar::PtrsEqual>
+      result;
+
+  // Ensure both columns have the same length
+  if (key_column->length() != value_column->length()) {
+    throw std::runtime_error("Key and value columns have different lengths.");
+  }
+
+  // Iterate over each row and populate the map
+  for (int64_t i = 0; i < key_column->length(); ++i) {
+    auto key_column_chunk = key_column->chunk(0);
+    auto value_column_chunk = value_column->chunk(0);
+    // Check for nulls
+    if (key_column_chunk->IsNull(i)) {
+      throw std::runtime_error("Null key value at index " + std::to_string(i) +
+                               " in " + key_column_name);
+    }
+    if (value_column_chunk->IsNull(i)) {
+      throw std::runtime_error("Null value at index " + std::to_string(i) +
+                               " in " + value_column_name);
+    }
+
+    // Extract key and value using the helper function
+    auto key = key_column_chunk->GetScalar(i).ValueOrDie();
+    auto value = std::static_pointer_cast<arrow::Int64Scalar>(
+                     value_column_chunk->GetScalar(i).ValueOrDie())
+                     ->value;
+    result.emplace(key, value);
+  }
+
+  return result;
+}
+
+template <graphar::Type type>
+graphar::Status CastToAny(std::shared_ptr<arrow::Array> array, std::any& any,
+                          int64_t index) {  // NOLINT
+  if (array->IsNull(index)) {
+    any = std::any();
+    return graphar::Status::OK();
+  }
+  using ArrayType = typename graphar::TypeToArrowType<type>::ArrayType;
+  auto column = std::dynamic_pointer_cast<ArrayType>(array);
+  any = column->GetView(index);
+  return graphar::Status::OK();
+}
+
+template <>
+graphar::Status CastToAny<graphar::Type::STRING>(
+    std::shared_ptr<arrow::Array> array, std::any& any,
+    int64_t index) {  // NOLINT
+  auto column = std::dynamic_pointer_cast<arrow::StringArray>(array);
+  any = column->GetString(index);
+  return graphar::Status::OK();
+}
+
+graphar::Status TryToCastToAny(const std::shared_ptr<graphar::DataType>& type,
+                               std::shared_ptr<arrow::Array> array,
+                               std::any& any, int64_t index = 0) {  // NOLINT
+  switch (type->id()) {
+  case graphar::Type::BOOL:
+    return CastToAny<graphar::Type::BOOL>(array, any, index);
+  case graphar::Type::INT32:
+    return CastToAny<graphar::Type::INT32>(array, any, index);
+  case graphar::Type::INT64:
+    return CastToAny<graphar::Type::INT64>(array, any, index);
+  case graphar::Type::FLOAT:
+    return CastToAny<graphar::Type::FLOAT>(array, any, index);
+  case graphar::Type::DOUBLE:
+    return CastToAny<graphar::Type::DOUBLE>(array, any, index);
+  case graphar::Type::STRING:
+    return CastToAny<graphar::Type::STRING>(array, any, index);
+  case graphar::Type::DATE:
+    return CastToAny<graphar::Type::DATE>(array, any, index);
+  case graphar::Type::TIMESTAMP:
+    return CastToAny<graphar::Type::TIMESTAMP>(array, any, index);
+  default:
+    return graphar::Status::TypeError("Unsupported type.");
+  }
+  return graphar::Status::OK();
+}
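+
+// Example (sketch): reading row 0 of an int64 array into std::any; "chunk" is
+// a hypothetical std::shared_ptr<arrow::Array>.
+//
+//   std::any cell;
+//   auto status = TryToCastToAny(graphar::int64(), chunk, cell);
+//   if (status.ok() && cell.has_value()) {
+//     auto v = std::any_cast<int64_t>(cell);
+//   }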
diff --git a/cli/test/merge.py b/cli/test/merge.py
new file mode 100644
index 000000000..1a7b73e1a
--- /dev/null
+++ b/cli/test/merge.py
@@ -0,0 +1,86 @@
+from enum import Enum
+from pathlib import Path
+from typing import List, Optional
+
+import pandas as pd
+import typer
+from typing_extensions import Annotated
+
+app = typer.Typer(no_args_is_help=True, context_settings={"help_option_names": ["-h", "--help"]})
+
+
+support_file_types = {"parquet", "orc", "csv", "json"}
+
+
+class FileType(str, Enum):
+    parquet = "parquet"
+    csv = "csv"
+    orc = "orc"
+    json = "json"
+
+
+@app.command(
+    "merge",
+    context_settings={"help_option_names": ["-h", "--help"]},
+    help="Merge source files",
+    no_args_is_help=True,
+)
+def merge_data(
+    files: Annotated[
+        List[str], typer.Option("--file", "-f", help="Files to merge", show_default=False)
+    ],
+    output_file: Annotated[
+        str, typer.Option("--output", "-o", help="Output file", show_default=False)
+    ],
+    type: Annotated[
+        Optional[FileType], typer.Option("--type", "-t", help="Type of data to output", show_default=False)
+    ] = None,
+):
+    if not files:
+        typer.echo("No files to merge")
+        raise typer.Exit(1)
+    if not output_file:
+        typer.echo("No output file")
+        raise typer.Exit(1)
+    data = []
+    for file in files:
+        path = Path(file)
+        if not path.is_file():
+            typer.echo(f"File {file} not found")
+            raise typer.Exit(1)
+        file_type = path.suffix.removeprefix(".")
+        if file_type == "":
+            typer.echo(f"File {file} has no file type suffix")
+            raise typer.Exit(1)
+        if file_type not in support_file_types:
+            typer.echo(f"File type {file_type} not supported")
+            raise typer.Exit(1)
+        if file_type == "parquet":
+            data.append(pd.read_parquet(file))
+        elif file_type == "csv":
+            data.append(pd.read_csv(file))
+        elif file_type == "orc":
+            data.append(pd.read_orc(file))
+        elif file_type == "json":
+            data.append(pd.read_json(file))
+    output_path = Path(output_file)
+    if output_path.is_file():
+        typer.echo(f"Output file {output_file} already exists")
+        if not typer.prompt("Do you want to overwrite it?", default=False):
+            raise typer.Exit(1)
+    if not type:
+        type = output_path.suffix.removeprefix(".")
+    result = pd.concat(data, ignore_index=True)
+    if type == "parquet":
+        result.to_parquet(output_file)
+    elif type == "csv":
+        result.to_csv(output_file)
+    elif type == "orc":
+        result.to_orc(output_file)
+    elif type == "json":
+        result.to_json(output_file, orient="records", lines=True)
+    typer.echo(f"Data merged to {output_file}")
+
+
+if __name__ == "__main__":
+    app()
diff --git a/cli/test/test_basic.py b/cli/test/test_basic.py
new file mode 100644
index 000000000..9f775b59b
--- /dev/null
+++ b/cli/test/test_basic.py
@@ -0,0 +1,7 @@
+from __future__ import annotations
+
+import graphar_cli as m
+
+
+def test_version():
+    assert m.__version__ == "0.0.1"
diff --git a/cpp/src/graphar/arrow/chunk_writer.h b/cpp/src/graphar/arrow/chunk_writer.h
index 65a8813ef..47edeff2e 100644
--- a/cpp/src/graphar/arrow/chunk_writer.h
+++ b/cpp/src/graphar/arrow/chunk_writer.h
@@ -187,6 +187,10 @@ class VertexPropertyWriter {
      const std::shared_ptr<GraphInfo>& graph_info, const std::string& type,
      const ValidateLevel& validate_level = ValidateLevel::no_validate);
 
+  Result<std::shared_ptr<arrow::Table>> addIndexColumn(
+      const std::shared_ptr<arrow::Table>& table, IdType chunk_index,
+      IdType chunk_size) const;
+
  private:
  /**
   * @brief Check if the operation of writing vertices number is allowed.
@@ -221,10 +225,6 @@ class VertexPropertyWriter {
      const std::shared_ptr<PropertyGroup>& property_group, IdType chunk_index,
      ValidateLevel validate_level) const;
 
-  Result<std::shared_ptr<arrow::Table>> addIndexColumn(
-      const std::shared_ptr<arrow::Table>& table, IdType chunk_index,
-      IdType chunk_size) const;
-
  private:
   std::shared_ptr<VertexInfo> vertex_info_;
   std::string prefix_;
diff --git a/cpp/src/graphar/graph_info.h b/cpp/src/graphar/graph_info.h
index f6d086966..875ac1556 100644
--- a/cpp/src/graphar/graph_info.h
+++ b/cpp/src/graphar/graph_info.h
@@ -38,6 +38,8 @@ class Property {
   bool is_primary;   // primary key tag
   bool is_nullable;  // nullable tag for non-primary key
 
+  Property() = default;
+
   explicit Property(const std::string& name,
                     const std::shared_ptr<DataType>& type = nullptr,
                     bool is_primary = false, bool is_nullable = true)

From 45ab64d296e891b8c5545915267cd4d0c19fa3f8 Mon Sep 17 00:00:00 2001
From: jasinliu <939282975@qq.com>
Date: Fri, 25 Oct 2024 17:30:24 +0800
Subject: [PATCH 07/30] license

---
 cli/CMakeLists.txt                 | 17 +++++++++++++++++
 cli/src/graphar_cli/__init__.py    | 17 +++++++++++++++++
 cli/src/graphar_cli/config.py      | 17 +++++++++++++++++
 cli/src/graphar_cli/graphar_cli.py | 17 +++++++++++++++++
 cli/src/graphar_cli/importer.py    | 17 +++++++++++++++++
 cli/src/graphar_cli/logging.py     | 17 +++++++++++++++++
 cli/src/importer.h                 | 19 +++++++++++++++++++
 cli/src/main.cc                    | 19 +++++++++++++++++++
 cli/src/util.h                     | 19 +++++++++++++++++++
 cli/test/merge.py                  | 17 +++++++++++++++++
 cli/test/test_basic.py             | 17 +++++++++++++++++
 11 files changed, 193 insertions(+)

diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt
index 8f01ae809..b8cb8973e 100644
--- a/cli/CMakeLists.txt
+++ b/cli/CMakeLists.txt
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 # Require CMake 3.15+ (matching scikit-build-core) Use new versions of all
 # policies up to CMake 3.27
 cmake_minimum_required(VERSION 3.15...3.27)
diff --git a/cli/src/graphar_cli/__init__.py b/cli/src/graphar_cli/__init__.py
index 4e7badc5c..e8091abdb 100644
--- a/cli/src/graphar_cli/__init__.py
+++ b/cli/src/graphar_cli/__init__.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.
See the License for the +# specific language governing permissions and limitations +# under the License. + from __future__ import annotations from ._core import __doc__, __version__ diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py index 3bfdca5df..39e847d95 100644 --- a/cli/src/graphar_cli/config.py +++ b/cli/src/graphar_cli/config.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from logging import getLogger from pathlib import Path from typing import Dict, List, Literal, Optional diff --git a/cli/src/graphar_cli/graphar_cli.py b/cli/src/graphar_cli/graphar_cli.py index b9ec4f24f..e3400a2c7 100644 --- a/cli/src/graphar_cli/graphar_cli.py +++ b/cli/src/graphar_cli/graphar_cli.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import json from logging import getLogger from pathlib import Path diff --git a/cli/src/graphar_cli/importer.py b/cli/src/graphar_cli/importer.py index 49d577193..ec568849f 100644 --- a/cli/src/graphar_cli/importer.py +++ b/cli/src/graphar_cli/importer.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from logging import getLogger from .config import ImportConfig diff --git a/cli/src/graphar_cli/logging.py b/cli/src/graphar_cli/logging.py index 3a2b90684..f8ef5b857 100644 --- a/cli/src/graphar_cli/logging.py +++ b/cli/src/graphar_cli/logging.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import logging from typing import Union diff --git a/cli/src/importer.h b/cli/src/importer.h index 4e70826c4..ef6c093de 100644 --- a/cli/src/importer.h +++ b/cli/src/importer.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + #pragma once #include diff --git a/cli/src/main.cc b/cli/src/main.cc index 2ed2e4157..03a02074f 100644 --- a/cli/src/main.cc +++ b/cli/src/main.cc @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + #include #include diff --git a/cli/src/util.h b/cli/src/util.h index d67d78d80..b7ea1d326 100644 --- a/cli/src/util.h +++ b/cli/src/util.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + #pragma once #include diff --git a/cli/test/merge.py b/cli/test/merge.py index 1a7b73e1a..53d7a7049 100644 --- a/cli/test/merge.py +++ b/cli/test/merge.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from enum import Enum from pathlib import Path from typing import List, Optional diff --git a/cli/test/test_basic.py b/cli/test/test_basic.py index 9f775b59b..44afa889c 100644 --- a/cli/test/test_basic.py +++ b/cli/test/test_basic.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
 from __future__ import annotations
 
 import graphar_cli as m

From 3393611585cbc2126edaf4218ca34c6a98185800 Mon Sep 17 00:00:00 2001
From: jasinliu <939282975@qq.com>
Date: Fri, 25 Oct 2024 17:34:26 +0800
Subject: [PATCH 08/30] fix license

---
 licenserc.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/licenserc.toml b/licenserc.toml
index 5353b95ba..33d89581c 100644
--- a/licenserc.toml
+++ b/licenserc.toml
@@ -61,4 +61,6 @@ excludes = [
   "java/build.xml",
   "**/*.json",
   "pyspark/poetry.lock",
+  "cli/*.yml",
+  "cli/*.toml",
 ]

From 305cca592a36fc19f842a9b35166f71cba87ff6a Mon Sep 17 00:00:00 2001
From: jasinliu <939282975@qq.com>
Date: Fri, 25 Oct 2024 17:36:19 +0800
Subject: [PATCH 09/30] remove conda recipe

---
 cli/conda.recipe/meta.yaml | 29 -----------------------------
 1 file changed, 29 deletions(-)
 delete mode 100644 cli/conda.recipe/meta.yaml

diff --git a/cli/conda.recipe/meta.yaml b/cli/conda.recipe/meta.yaml
deleted file mode 100644
index 842a2bf72..000000000
--- a/cli/conda.recipe/meta.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-package:
-  name: graphar_cli
-  version: 0.0.1
-
-source:
-  path: ..
-
-build:
-  number: 0
-  script: {{ PYTHON }} -m pip install . -vv
-
-requirements:
-  build:
-    - python
-    - {{ compiler('cxx') }}
-
-  host:
-    - python
-    - pip
-    - scikit-build-core
-    - pybind11 >=2.10.0
-
-  run:
-    - python
-
-
-about:
-  summary: GraphAR Command Line
-  license_file: LICENSE

From cda248760c4bac87e9d7ae59e4b5f8b7f688cb6a Mon Sep 17 00:00:00 2001
From: jasinliu <939282975@qq.com>
Date: Fri, 25 Oct 2024 18:00:19 +0800
Subject: [PATCH 10/30] enable ci

---
 cli/import.mini.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cli/import.mini.yml b/cli/import.mini.yml
index 1b1a2a8f8..5a2fd3944 100644
--- a/cli/import.mini.yml
+++ b/cli/import.mini.yml
@@ -1,5 +1,5 @@
 graphar:
-  path: /workspaces/incubator-graphar/graphar/movie
+  path: /temp/graphar/movie
   name: MovieGraph
 
 import_schema:
       - name: born
         data_type: int64
     sources:
-      - path: /workspaces/incubator-graphar/testing/neo4j/data/Person.parquet
+      - path: ../testing/neo4j/data/Person.parquet
         columns:
           name: name
           born: born
       - name: tagline
         data_type: string
     sources:
-      - path: /workspaces/incubator-graphar/testing/neo4j/data/Movie.orc
+      - path: ../testing/neo4j/data/Movie.orc
         columns:
           title: title
           tagline: tagline
     dst_type: Movie
     dst_prop: title
     sources:
-      - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv
+      - path: ../testing/neo4j/data/Person_WROTE_Movie.csv
         columns:
           name: name
           title: title
     dst_type: Movie
     dst_prop: title
     sources:
-      - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.json
+      - path: ../testing/neo4j/data/Person_ACTED_IN_Movie.json
         columns:
           name: name
           title: title
       - name: summary
         data_type: string
     sources:
-      - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_REVIEWED_Movie.csv
+      - path: ../testing/neo4j/data/Person_REVIEWED_Movie.csv
         columns:
           name: name
           title: title

From 0dc6ff43d567efb7c88af3517dc406ff7ea813a1 Mon Sep 17 00:00:00 2001
From: jasinliu <939282975@qq.com>
Date: Fri, 25 Oct 2024 18:00:43 +0800
Subject: [PATCH 11/30] fix typo

---
 cli/import.mini.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cli/import.mini.yml b/cli/import.mini.yml
index 5a2fd3944..86cf92f37 100644
--- a/cli/import.mini.yml
+++ b/cli/import.mini.yml
@@ -1,5
+1,5 @@ graphar: - path: /temp/graphar/movie + path: /tmp/graphar/movie name: MovieGraph import_schema: From 8e942be75d668de6d52e3c41b75ee248327bf713 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Fri, 25 Oct 2024 18:15:11 +0800 Subject: [PATCH 12/30] fix dependency --- cli/src/graphar_cli/graphar_cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cli/src/graphar_cli/graphar_cli.py b/cli/src/graphar_cli/graphar_cli.py index e3400a2c7..cf610ca1d 100644 --- a/cli/src/graphar_cli/graphar_cli.py +++ b/cli/src/graphar_cli/graphar_cli.py @@ -20,7 +20,6 @@ from pathlib import Path from typing import List -import pandas as pd import typer import yaml From fb684fc29cd7fdda31c4ee955752f8c477386d51 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Fri, 25 Oct 2024 18:18:51 +0800 Subject: [PATCH 13/30] update ci --- .github/workflows/cli.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index f9d61b793..589d4c043 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -67,8 +67,12 @@ jobs: - name: Install GraphAr CLI working-directory: "cli" run: | - pip install ./ -vvv + pip install ./ -v graphar-cli --help + graphar-cli check -f ../testing/neo4j/MovieGraph.graph.yml + graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -v Person + graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie + graphar-cli import -c import.mini.yml - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4.0.1 @@ -100,7 +104,7 @@ jobs: - name: Build GraphAr And Run Tests working-directory: "cli" run: | - pip install ./ -vvv + pip install ./ graphar-cli --help graphar-cli check -f ../testing/neo4j/MovieGraph.graph.yml graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -v Person From 5125d9ca89a8782fc404dff7dc3ebfa87161168b Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Fri, 25 Oct 2024 19:48:12 +0800 Subject: [PATCH 14/30] add data --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index 491fce725..0b22fba6f 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 491fce7259406ff33986229590576589b84230d8 +Subproject commit 0b22fba6f9e0f467b1121feae379e60eb2eb9f99 From 9e35d48a390304e710657eac0ccad6e41ebec228 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Fri, 25 Oct 2024 21:24:11 +0800 Subject: [PATCH 15/30] fix ci --- .github/workflows/cli.yml | 2 +- cli/CMakeLists.txt | 5 +++-- cli/src/graphar_cli/config.py | 7 ++++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index 589d4c043..9a04ae205 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -64,7 +64,7 @@ jobs: libparquet-dev sudo apt-get install -y ccache libcurl4-openssl-dev - - name: Install GraphAr CLI + - name: Install GraphAr CLI and Run Tests working-directory: "cli" run: | pip install ./ -v diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index b8cb8973e..6f7f12fad 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -25,7 +25,7 @@ project( ${SKBUILD_PROJECT_NAME} VERSION ${SKBUILD_PROJECT_VERSION} LANGUAGES CXX) - +set(CMAKE_CXX_FLAGS "-g -std=c++17 -lstdc++fs") add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../cpp ${CMAKE_BINARY_DIR}/graphar) # Find the module development requirements (requires FindPython from 3.17 or @@ -36,7 +36,8 @@ find_package(pybind11 CONFIG REQUIRED) # 
Add a library using FindPython's tooling (pybind11 also provides a helper like # this) python_add_library(_core MODULE src/main.cc WITH_SOABI) -target_link_libraries(_core PRIVATE pybind11::headers graphar) + +target_link_libraries(_core PRIVATE pybind11::headers graphar stdc++fs) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/src) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/thirdparty) diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py index 39e847d95..dd85175c7 100644 --- a/cli/src/graphar_cli/config.py +++ b/cli/src/graphar_cli/config.py @@ -47,7 +47,7 @@ class GraphArConfig(BaseModel): @field_validator("path") def check_path(cls, v): - path = Path(v) + path = Path(v).resolve().absolute() if not path.exists(): path.mkdir(parents=True, exist_ok=True) elif any(path.iterdir()): @@ -94,8 +94,9 @@ class Source(BaseModel): @field_validator("path") def check_path(cls, v): - if not Path(v).is_file(): - msg = f"{v} is not a file." + path = Path(v).resolve().absolute() + if not path.is_file(): + msg = f"'{path}' is not a file." raise ValueError(msg) return v From 63e77ee188a5b4eadec7703ae5f3c0b0b234b79d Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Fri, 25 Oct 2024 21:46:34 +0800 Subject: [PATCH 16/30] fix cmake --- cli/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index 6f7f12fad..90f33afe2 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -25,7 +25,7 @@ project( ${SKBUILD_PROJECT_NAME} VERSION ${SKBUILD_PROJECT_VERSION} LANGUAGES CXX) -set(CMAKE_CXX_FLAGS "-g -std=c++17 -lstdc++fs") +set(CMAKE_CXX_STANDARD 17) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../cpp ${CMAKE_BINARY_DIR}/graphar) # Find the module development requirements (requires FindPython from 3.17 or @@ -37,7 +37,7 @@ find_package(pybind11 CONFIG REQUIRED) # this) python_add_library(_core MODULE src/main.cc WITH_SOABI) -target_link_libraries(_core PRIVATE pybind11::headers graphar stdc++fs) +target_link_libraries(_core PRIVATE pybind11::headers graphar) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/src) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/thirdparty) From 3b7758af266237ea75be15e3726bf00fd9f8850c Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Fri, 25 Oct 2024 21:58:01 +0800 Subject: [PATCH 17/30] add arrow in cmake --- cli/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index 90f33afe2..7939115a3 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -25,6 +25,7 @@ project( ${SKBUILD_PROJECT_NAME} VERSION ${SKBUILD_PROJECT_VERSION} LANGUAGES CXX) + set(CMAKE_CXX_STANDARD 17) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../cpp ${CMAKE_BINARY_DIR}/graphar) @@ -32,12 +33,13 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../cpp ${CMAKE_BINARY_DIR}/graphar) # scikit-build-core's built-in backport) find_package(Python REQUIRED COMPONENTS Interpreter Development.Module) find_package(pybind11 CONFIG REQUIRED) +find_package(Arrow REQUIRED) # Add a library using FindPython's tooling (pybind11 also provides a helper like # this) python_add_library(_core MODULE src/main.cc WITH_SOABI) -target_link_libraries(_core PRIVATE pybind11::headers 
graphar) +target_link_libraries(_core PRIVATE pybind11::headers graphar Arrow::arrow_shared) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/src) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/thirdparty) From 4d722378d05f4d263f9a854ca4518caf32864ae2 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Fri, 25 Oct 2024 22:07:33 +0800 Subject: [PATCH 18/30] add dependency --- cli/CMakeLists.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index 7939115a3..8f445451d 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -34,12 +34,19 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../cpp ${CMAKE_BINARY_DIR}/graphar) find_package(Python REQUIRED COMPONENTS Interpreter Development.Module) find_package(pybind11 CONFIG REQUIRED) find_package(Arrow REQUIRED) +find_package(ArrowDataset REQUIRED) +find_package(ArrowAcero REQUIRED) +find_package(Parquet REQUIRED) # Add a library using FindPython's tooling (pybind11 also provides a helper like # this) python_add_library(_core MODULE src/main.cc WITH_SOABI) -target_link_libraries(_core PRIVATE pybind11::headers graphar Arrow::arrow_shared) +target_link_libraries(_core PRIVATE pybind11::headers graphar Arrow::arrow_shared + Parquet::parquet_shared + ArrowDataset::arrow_dataset_shared + ArrowAcero::arrow_acero_shared + ) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/src) target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/thirdparty) From 9fc73539f141b7fec23fe2fcd4e4440ac4ae565c Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Tue, 29 Oct 2024 15:32:53 +0800 Subject: [PATCH 19/30] fix review --- .github/workflows/cli.yml | 12 +++++++----- cli/CMakeLists.txt | 2 +- cli/README.md | 12 ++++++------ cli/pyproject.toml | 6 +++--- cli/src/graphar_cli/config.py | 35 +++++++++++++++++----------------- cli/src/graphar_cli/logging.py | 2 +- cli/src/importer.h | 17 ++++++++--------- cli/src/main.cc | 15 ++++++++------- cli/src/util.h | 29 ++++++++++++++++------------ 9 files changed, 69 insertions(+), 61 deletions(-) diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index 9a04ae205..a316ee8f4 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -105,8 +105,10 @@ jobs: working-directory: "cli" run: | pip install ./ - graphar-cli --help - graphar-cli check -f ../testing/neo4j/MovieGraph.graph.yml - graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -v Person - graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie - graphar-cli import -c import.mini.yml + graphar --help + graphar check -p ../testing/neo4j/MovieGraph.graph.yml + graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person + graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie + graphar import -c import.mini.yml + + # TODO: Add unit tests diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index 8f445451d..3639bbf4e 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -17,7 +17,7 @@ # Require CMake 3.15+ (matching scikit-build-core) Use new versions of all # policies up to CMake 3.27 -cmake_minimum_required(VERSION 3.15...3.27) +cmake_minimum_required(VERSION 3.15) # Scikit-build-core sets these values for you, or you can 
just hard-code the
 # name and version.
diff --git a/cli/README.md b/cli/README.md
index 369fc1b02..26c8eeacd 100644
--- a/cli/README.md
+++ b/cli/README.md
@@ -27,19 +27,19 @@ And using Python in conda or venv is a good choice.
 ## Usage
 
 ```bash
-graphar-cli --help
+graphar --help
 
 # check the metadata, verifying that the vertex and edge information and the attribute information of the graph are valid
-graphar-cli check -f ../testing/neo4j/MovieGraph.graph.yml
+graphar check -p ../testing/neo4j/MovieGraph.graph.yml
 
 # show the vertex
-graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -v Person
+graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person
 
 # show the edge
-graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie
+graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie
 
 # import graph data by using a config file
-graphar-cli import -c import.mini.yml
+graphar import -c import.mini.yml
 ```
 
 ## Import config file
@@ -57,7 +57,7 @@ In the yaml config files, we provide brief comments on the fields, which can be
 
 **Example**
 
 To import the movie graph data from the `testing` directory, you first need to prepare data files. Supported file types include `csv`, `json` (as well as `jsonline`, which should also use the `.json` extension), `parquet`, and `orc` files. Please ensure the correct file extensions are set in advance, or specify the `file_type` field in the source section of the configuration. When set, the `file_type` field takes precedence over the file extension.
 
 Next, write a configuration file following the provided sample. Any empty fields in the `graphar` configuration will be filled with default values. In the `import_schema`, empty fields will use the global configuration values from `graphar`. If fields in `import_schema` are not empty, they will override the values from `graphar`.
diff --git a/cli/pyproject.toml b/cli/pyproject.toml index 889e3e157..5dda7103b 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -10,13 +10,13 @@ description = "GraphAr command line tool" readme = "README.md" authors = [{ name = "GraphAr community", email = "dev@graphar.apache.org" }] requires-python = ">=3.7" -dependencies = ["typer ~= 0.12.3", "pydantic ~= 2.8.2", "pyyaml ~= 6.0.2"] +dependencies = ["typer ~= 0.1", "pydantic ~= 2.0", "pyyaml ~= 6.0"] [project.optional-dependencies] -test = ["pandas ~= 2.2.2"] +test = ["pandas ~= 2.0", "typing_extensions ~= 4.0"] [project.scripts] -graphar-cli = "graphar_cli.graphar_cli:main" +graphar = "graphar_cli.graphar_cli:main" [tool.scikit-build] diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py index dd85175c7..17fc70147 100644 --- a/cli/src/graphar_cli/config.py +++ b/cli/src/graphar_cli/config.py @@ -17,33 +17,33 @@ from logging import getLogger from pathlib import Path -from typing import Dict, List, Literal, Optional +from typing import Dict, List, Literal, Optional # TODO: move to the TYPE_CHECKING block from pydantic import BaseModel, field_validator, model_validator from typing_extensions import Self logger = getLogger("graphar_cli") -# TODO: convert to constants -default_file_type = "parquet" -default_adj_list_type = "ordered_by_source" -default_regular_separator = "_" -default_validate_level = "weak" -default_version = "gar/v1" -support_file_types = {"parquet", "orc", "csv", "json"} +# TODO: move them to constants.py +DEFAULT_FILE_TYPE = "parquet" +DEFAULT_ADJ_LIST_TYPE = "ordered_by_source" +DEFAULT_REGULAR_SEPARATOR = "_" +DEFAULT_VALIDATE_LEVEL = "weak" +DEFAULT_VERSION = "gar/v1" +SUPPORT_FILE_TYPES = {"parquet", "orc", "csv", "json"} class GraphArConfig(BaseModel): path: str name: str vertex_chunk_size: Optional[int] = 100 edge_chunk_size: Optional[int] = 1024 - file_type: Literal["parquet", "orc", "csv", "json"] = default_file_type + file_type: Literal["parquet", "orc", "csv", "json"] = DEFAULT_FILE_TYPE adj_list_type: Literal[ "ordered_by_source", "ordered_by_dest", "unordered_by_source", "unordered_by_dest" - ] = default_adj_list_type - validate_level: Literal["no", "weak", "strong"] = default_validate_level - version: Optional[str] = default_version + ] = DEFAULT_ADJ_LIST_TYPE + validate_level: Literal["no", "weak", "strong"] = DEFAULT_VALIDATE_LEVEL + version: Optional[str] = DEFAULT_VERSION @field_validator("path") def check_path(cls, v): @@ -57,6 +57,7 @@ def check_path(cls, v): class Property(BaseModel): + name: str data_type: Literal["bool", "int32", "int64", "float", "double", "string", "date", "timestamp"] is_primary: bool = False @@ -114,7 +115,7 @@ def check_file_type(self) -> Self: if file_type == "": msg = f"File {self.path} has no file type suffix" raise ValueError(msg) - if file_type not in support_file_types: + if file_type not in SUPPORT_FILE_TYPES: msg = f"File type {file_type} not supported" raise ValueError(msg) self.file_type = file_type @@ -162,9 +163,9 @@ class AdjList(BaseModel): class Edge(BaseModel): edge_type: str src_type: str - src_prop: Optional[str] = None + src_prop: str dst_type: str - dst_prop: Optional[str] = None + dst_prop: str labels: List[str] = [] chunk_size: Optional[int] = None validate_level: Optional[Literal["no", "weak", "strong"]] = None @@ -189,8 +190,8 @@ def check_prefix(self) -> Self: if not prefix: self.prefix = ( f"edge/{src_type}" - f"{default_regular_separator}{edge_type}" - f"{default_regular_separator}{dst_type}/" + 
f"{DEFAULT_REGULAR_SEPARATOR}{edge_type}"
+                f"{DEFAULT_REGULAR_SEPARATOR}{dst_type}/"
             )
         return self
diff --git a/cli/src/graphar_cli/logging.py b/cli/src/graphar_cli/logging.py
index f8ef5b857..a2a595860 100644
--- a/cli/src/graphar_cli/logging.py
+++ b/cli/src/graphar_cli/logging.py
@@ -16,7 +16,7 @@
 # under the License.
 
 import logging
-from typing import Union
+from typing import Union  # TODO: move to the TYPE_CHECKING block
 
 from rich.console import Console
 from rich.logging import RichHandler
diff --git a/cli/src/importer.h b/cli/src/importer.h
index ef6c093de..6baeaf44c 100644
--- a/cli/src/importer.h
+++ b/cli/src/importer.h
@@ -21,15 +21,14 @@
 
 #include <filesystem>
 
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include <arrow/api.h>
-#include <graphar/api/arrow_writer.h>
-#include <graphar/api/high_level_writer.h>
-#include <graphar/convert_to_arrow_type.h>
-#include <graphar/graph_info.h>
-#include <graphar/high-level/graph_reader.h>
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+#include "arrow/api.h"
+#include "graphar/api/arrow_writer.h"
+#include "graphar/api/high_level_writer.h"
+#include "graphar/convert_to_arrow_type.h"
+#include "graphar/graph_info.h"
+#include "graphar/high-level/graph_reader.h"
 
 #include "util.h"
diff --git a/cli/src/main.cc b/cli/src/main.cc
index 03a02074f..84e690989 100644
--- a/cli/src/main.cc
+++ b/cli/src/main.cc
@@ -17,18 +17,19 @@
  * under the License.
  */
 
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
 
-#include <graphar/filesystem.h>
-#include <graphar/graph_info.h>
-#include <graphar/reader_util.h>
+#include "graphar/filesystem.h"
+#include "graphar/graph_info.h"
+#include "graphar/reader_util.h"
 
 #include "importer.h"
 
 #define STRINGIFY(x) #x
 #define MACRO_STRINGIFY(x) STRINGIFY(x)
 
 std::string ShowGraph(const std::string& path) {
+  // TODO: check all the result values
   auto graph_info = graphar::GraphInfo::Load(path).value();
   return graph_info->Dump().value();
 }
@@ -127,7 +128,7 @@ int64_t GetEdgeCount(const std::string& path, const std::string& src_type,
 std::vector<std::string> GetVertexTypes(const std::string& path) {
   auto graph_info = graphar::GraphInfo::Load(path).value();
   auto vertex_infos = graph_info->GetVertexInfos();
-  std::vector<std::string> vertex_types;
+  std::vector<std::string> vertex_types(vertex_infos.size());
   for (const auto& vertex_info : vertex_infos) {
     vertex_types.push_back(vertex_info->GetType());
   }
@@ -137,7 +138,7 @@ std::vector<std::vector<std::string>> GetEdgeTypes(const std::string& path) {
   auto graph_info = graphar::GraphInfo::Load(path).value();
   auto edge_infos = graph_info->GetEdgeInfos();
-  std::vector<std::vector<std::string>> edge_types;
+  std::vector<std::vector<std::string>> edge_types(edge_infos.size());
   for (const auto& edge_info : edge_infos) {
     std::vector<std::string> edge_type;
     edge_type.push_back(edge_info->GetSrcType());
diff --git a/cli/src/util.h b/cli/src/util.h
index b7ea1d326..600260441 100644
--- a/cli/src/util.h
+++ b/cli/src/util.h
@@ -19,18 +19,18 @@
 
 #pragma once
 
-#include <arrow/adapters/orc/adapter.h>
-#include <arrow/api.h>
-#include <arrow/compute/api.h>
-#include <arrow/csv/api.h>
-#include <arrow/io/api.h>
-#include <arrow/json/api.h>
-#include <graphar/api/arrow_writer.h>
-#include <graphar/api/high_level_writer.h>
-#include <graphar/graph_info.h>
-#include <parquet/arrow/reader.h>
-
-#include <any>
+#ifdef ARROW_ORC
+#include "arrow/adapters/orc/adapter.h"
+#endif
+#include "arrow/api.h"
+#include "arrow/compute/api.h"
+#include "arrow/csv/api.h"
+#include "arrow/io/api.h"
+#include "arrow/json/api.h"
+#include "graphar/api/arrow_writer.h"
+#include "graphar/api/high_level_writer.h"
+#include "graphar/graph_info.h"
+#include "parquet/arrow/reader.h"
 
 std::string ConcatEdgeTriple(const std::string& src_type,
                              const std::string& edge_type,
@@ -173,6 +173,7 @@ std::shared_ptr<arrow::Table> GetDataFromCsvFile(
   return table;
 }
 
+#ifdef ARROW_ORC
 std::shared_ptr<arrow::Table> GetDataFromOrcFile(
     const std::string& path, const std::vector<std::string>& column_names) {
   // Open the ORC file
@@ -205,6 +206,7 @@ std::shared_ptr<arrow::Table> GetDataFromOrcFile(
 
   return table;
 }
+#endif
std::shared_ptr GetDataFromJsonFile( const std::string& path, const std::vector& column_names) { @@ -254,12 +256,15 @@ std::shared_ptr GetDataFromFile( const std::string& path, const std::vector& column_names, const char& delimiter, const std::string& file_type) { // TODO: use explicit schema + // TODO: use switch case if (file_type == "parquet") { return GetDataFromParquetFile(path, column_names); } else if (file_type == "csv") { return GetDataFromCsvFile(path, column_names, delimiter); +#ifdef ARROW_ORC } else if (file_type == "orc") { return GetDataFromOrcFile(path, column_names); +#endif } else if (file_type == "json") { return GetDataFromJsonFile(path, column_names); } else { From 77bc97c217ca92e45d0a4906a4666f5633f850fe Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Tue, 29 Oct 2024 16:52:00 +0800 Subject: [PATCH 20/30] fix ci --- cli/import.full.json | 223 -------------------------- cli/import.full.yml | 6 - cli/import.json | 124 -------------- cli/import.mini.json | 130 --------------- cli/src/graphar_cli/config.py | 1 - cli/src/graphar_cli/graphar_cli.py | 4 +- cli/src/importer.h | 11 +- cli/src/main.cc | 4 +- cpp/src/graphar/arrow/chunk_writer.cc | 6 +- cpp/src/graphar/arrow/chunk_writer.h | 2 +- 10 files changed, 14 insertions(+), 497 deletions(-) delete mode 100644 cli/import.full.json delete mode 100644 cli/import.json delete mode 100644 cli/import.mini.json diff --git a/cli/import.full.json b/cli/import.full.json deleted file mode 100644 index 86c6b8f0c..000000000 --- a/cli/import.full.json +++ /dev/null @@ -1,223 +0,0 @@ -{ - "graphar": { - "path": "/workspaces/incubator-graphar/graphar/movie", - "name": "MovieGraph", - "vertex_chunk_size": 100, - "edge_chunk_size": 1024, - "file_type": "parquet", - "adj_list_type": "ordered_by_source", - "validate_level": "weak", - "version": "gar/v1" - }, - "import_schema": { - "vertices": [ - { - "type": "Person", - "labels": [ - "Person" - ], - "chunk_size": 100, - "validate_level": "no", - "prefix": "vertex/Person/", - "property_groups": [ - { - "file_type": "parquet", - "properties": [ - { - "name": "name", - "data_type": "string", - "is_primary": true, - "nullable": false - }, - { - "name": "born", - "data_type": "int64", - "is_primary": false, - "nullable": true - } - ] - } - ], - "sources": [ - { - "file_type": "parquet", - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person.parquet", - "columns": { - "name": "name", - "born": "born" - } - } - ] - }, - { - "type": "Movie", - "labels": [ - "Movie" - ], - "chunk_size": 100, - "validate_level": "no", - "prefix": "vertex/Movie/", - "property_groups": [ - { - "file_type": "parquet", - "properties": [ - { - "name": "title", - "data_type": "string", - "is_primary": true, - "nullable": false - }, - { - "name": "tagline", - "data_type": "string", - "is_primary": false, - "nullable": true - } - ] - } - ], - "sources": [ - { - "file_type": "orc", - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Movie.orc", - "columns": { - "title": "title", - "tagline": "tagline" - } - } - ] - } - ], - "edges": [ - { - "edge_type": "WROTE", - "src_type": "Person", - "src_prop": "name", - "dst_type": "Movie", - "dst_prop": "title", - "labels": [ - "WROTE" - ], - "chunk_size": 1024, - "validate_level": "no", - "prefix": "edge/Person_WROTE_Movie/", - "adj_lists": [ - { - "ordered": true, - "aligned_by": "src", - "file_type": "parquet" - }, - { - "ordered": true, - "aligned_by": "dst", - "file_type": "parquet" - } - ], - "sources": [ - { - "file_type": "csv", - 
"delimiter": ",", - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv", - "columns": { - "name": "name", - "title": "title" - } - } - ] - }, - { - "edge_type": "ACTED_IN", - "src_type": "Person", - "src_prop": "name", - "dst_type": "Movie", - "dst_prop": "title", - "validate_level": "no", - "labels": [ - "ACTED_IN" - ], - "chunk_size": 1024, - "prefix": "edge/Person_ACTED_IN_Movie/", - "adj_lists": [ - { - "ordered": true, - "aligned_by": "src", - "file_type": "parquet" - }, - { - "ordered": true, - "aligned_by": "dst", - "file_type": "parquet" - } - ], - "sources": [ - { - "file_type": "json", - "delimiter": ",", - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.json", - "columns": { - "name": "name", - "title": "title" - } - } - ] - }, - { - "edge_type": "REVIEWED", - "src_type": "Person", - "src_prop": "name", - "dst_type": "Movie", - "dst_prop": "title", - "labels": [ - "Review" - ], - "chunk_size": 1024, - "validate_level": "no", - "prefix": "edge/Person_REVIEWED_Movie/", - "property_groups": [ - { - "file_type": "parquet", - "properties": [ - { - "name": "rating", - "data_type": "int64", - "is_primary": false, - "nullable": true - }, - { - "name": "summary", - "data_type": "string", - "is_primary": false, - "nullable": true - } - ] - } - ], - "adj_lists": [ - { - "ordered": true, - "aligned_by": "src", - "file_type": "parquet" - }, - { - "ordered": true, - "aligned_by": "dst", - "file_type": "parquet" - } - ], - "sources": [ - { - "file_type": "csv", - "delimiter": ",", - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_REVIEWED_Movie.csv", - "columns": { - "name": "name", - "title": "title", - "summary": "summary", - "rating": "rating" - } - } - ] - } - ] - } -} \ No newline at end of file diff --git a/cli/import.full.yml b/cli/import.full.yml index 36e571b2f..cdc5611a3 100644 --- a/cli/import.full.yml +++ b/cli/import.full.yml @@ -62,8 +62,6 @@ import_schema: src_prop: name dst_type: Movie dst_prop: title - labels: - - WROTE chunk_size: 1024 validate_level: "no" prefix: edge/Person_WROTE_Movie/ @@ -87,8 +85,6 @@ import_schema: dst_type: Movie dst_prop: title validate_level: "no" - labels: - - ACTED_IN chunk_size: 1024 prefix: edge/Person_ACTED_IN_Movie/ adj_lists: @@ -110,8 +106,6 @@ import_schema: src_prop: name dst_type: Movie dst_prop: title - labels: - - Review chunk_size: 1024 validate_level: "no" prefix: edge/Person_REVIEWED_Movie/ diff --git a/cli/import.json b/cli/import.json deleted file mode 100644 index 66fbb0a0a..000000000 --- a/cli/import.json +++ /dev/null @@ -1,124 +0,0 @@ -{ - "graphar": { - "path": "/tmp/graphar/movie", - "name": "MovieGraph", - "vertexChunkSize": 100, - "edgeChunkSize": 1024, - "fileType": "parquet", - "adjListType": "ordered_by_source" - }, - "import_schema": { - "vertices": [ - { - "type": "Person", - "labels": [ - "Person" - ], - "primary": "name", - "properties": [ - { - "name": "name", - "type": "string" - }, - { - "name": "born", - "type": "int64", - "nullable": true - } - ], - "propertyGroups": [ - [ - "name", - "born" - ] - ], - "source": { - "type": "parquet", - "path": "/tmp/graphar/movie/Person.parquet", - "columns": { - "name": "name", - "born": "born" - } - } - }, - { - "type": "Movie", - "labels": [ - "Movie" - ], - "primary": "title", - "properties": [ - { - "name": "title", - "type": "string" - }, - { - "name": "tagline", - "type": "string", - "nullable": true - } - ], - "propertyGroups": [ - [ - "title", - "tagline" - ] - ], - "source": { - "type": 
"parquet", - "path": "/tmp/graphar/movie/Movie.csv", - "delimiter": ",", - "columns": { - "title": "title", - "tagline": "tagline" - } - } - } - ], - "edges": [ - { - "type": "PRODUCED", - "adjListType": "ordered_by_dest", - "srcType": "Person", - "srcProp": "name", - "dstType": "Movie", - "dstProp": "title", - "source": { - "type": "parquet", - "path": "/tmp/graphar/movie/Produced.csv", - "columns": { - "srcIndex": "_graphArSrcIndex", - "dstIndex": "_graphArDstIndex" - } - } - }, - { - "type": "REVIEWED", - "srcType": "Person", - "srcProp": "name", - "dstType": "Movie", - "dstProp": "title", - "properties": [ - { - "name": "rating", - "type": "string" - }, - { - "name": "summary", - "type": "string" - } - ], - "source": { - "type": "parquet", - "path": "/tmp/graphar/movie/Produced.csv", - "columns": { - "srcIndex": "_graphArSrcIndex", - "dstIndex": "_graphArDstIndex", - "rating": "rating", - "summary": "summary" - } - } - } - ] - } -} \ No newline at end of file diff --git a/cli/import.mini.json b/cli/import.mini.json deleted file mode 100644 index 8d85c77f0..000000000 --- a/cli/import.mini.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "graphar": { - "path": "/workspaces/incubator-graphar/graphar/movie", - "name": "MovieGraph" - }, - "import_schema": { - "vertices": [ - { - "type": "Person", - "property_groups": [ - { - "properties": [ - { - "name": "name", - "data_type": "string", - "is_primary": true - }, - { - "name": "born", - "data_type": "int64" - } - ] - } - ], - "sources": [ - { - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person.parquet", - "columns": { - "name": "name", - "born": "born" - } - } - ] - }, - { - "type": "Movie", - "property_groups": [ - { - "properties": [ - { - "name": "title", - "data_type": "string", - "is_primary": true - }, - { - "name": "tagline", - "data_type": "string" - } - ] - } - ], - "sources": [ - { - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Movie.orc", - "columns": { - "title": "title", - "tagline": "tagline" - } - } - ] - } - ], - "edges": [ - { - "edge_type": "WROTE", - "src_type": "Person", - "src_prop": "name", - "dst_type": "Movie", - "dst_prop": "title", - "sources": [ - { - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv", - "columns": { - "name": "name", - "title": "title" - } - } - ] - }, - { - "edge_type": "ACTED_IN", - "src_type": "Person", - "src_prop": "name", - "dst_type": "Movie", - "dst_prop": "title", - "sources": [ - { - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.csv", - "columns": { - "name": "name", - "title": "title" - } - } - ] - }, - { - "edge_type": "REVIEWED", - "src_type": "Person", - "src_prop": "name", - "dst_type": "Movie", - "dst_prop": "title", - "property_groups": [ - { - "properties": [ - { - "name": "rating", - "data_type": "int64" - }, - { - "name": "summary", - "data_type": "string" - } - ] - } - ], - "sources": [ - { - "path": "/workspaces/incubator-graphar/testing/neo4j/data/Person_REVIEWED_Movie.csv", - "columns": { - "name": "name", - "title": "title", - "summary": "summary", - "rating": "rating" - } - } - ] - } - ] - } -} \ No newline at end of file diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py index 17fc70147..c7b337772 100644 --- a/cli/src/graphar_cli/config.py +++ b/cli/src/graphar_cli/config.py @@ -166,7 +166,6 @@ class Edge(BaseModel): src_prop: str dst_type: str dst_prop: str - labels: List[str] = [] chunk_size: Optional[int] = None validate_level: Optional[Literal["no", "weak", 
"strong"]] = None adj_lists: List[AdjList] = [] diff --git a/cli/src/graphar_cli/graphar_cli.py b/cli/src/graphar_cli/graphar_cli.py index cf610ca1d..193575e85 100644 --- a/cli/src/graphar_cli/graphar_cli.py +++ b/cli/src/graphar_cli/graphar_cli.py @@ -57,7 +57,7 @@ no_args_is_help=True, ) def show( - path: str = typer.Option(None, "--path", "-f", help="Path to the GraphAr config file"), + path: str = typer.Option(None, "--path", "-p", help="Path to the GraphAr config file"), vertex: str = typer.Option(None, "--vertex", "-v", help="Vertex type to show"), edge_src: str = typer.Option(None, "--edge-src", "-es", help="Source of the edge type to show"), edge: str = typer.Option(None, "--edge", "-e", help="Edge type to show"), @@ -108,7 +108,7 @@ def show( no_args_is_help=True, ) def check( - path: str = typer.Option(None, "--path", "-f", help="Path to the GraphAr config file"), + path: str = typer.Option(None, "--path", "-p", help="Path to the GraphAr config file"), ): if not Path(path).exists(): logger.error("File not found: %s", path) diff --git a/cli/src/importer.h b/cli/src/importer.h index 6baeaf44c..e142de4a4 100644 --- a/cli/src/importer.h +++ b/cli/src/importer.h @@ -21,14 +21,14 @@ #include -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" #include "arrow/api.h" #include "graphar/api/arrow_writer.h" #include "graphar/api/high_level_writer.h" #include "graphar/convert_to_arrow_type.h" #include "graphar/graph_info.h" #include "graphar/high-level/graph_reader.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" #include "util.h" @@ -265,8 +265,9 @@ std::string DoImport(const py::dict& config_dict) { pgs.emplace_back(property_group); } - auto vertex_info = graphar::CreateVertexInfo(vertex.type, vertex.chunk_size, - pgs, vertex.prefix, version); + auto vertex_info = + graphar::CreateVertexInfo(vertex.type, vertex.chunk_size, pgs, + vertex.labels, vertex.prefix, version); auto file_name = vertex.type + ".vertex.yml"; vertex_info->Save(save_path / file_name); auto save_path_str = save_path.string(); @@ -329,7 +330,7 @@ std::string DoImport(const py::dict& config_dict) { auto vertex_table_with_index = vertex_prop_writer - ->addIndexColumn(merged_vertex_table, start_chunk_index, + ->AddIndexColumn(merged_vertex_table, start_chunk_index, vertex_info->GetChunkSize()) .value(); for (const auto& property_group : pgs) { diff --git a/cli/src/main.cc b/cli/src/main.cc index 84e690989..b3633c19f 100644 --- a/cli/src/main.cc +++ b/cli/src/main.cc @@ -128,7 +128,7 @@ int64_t GetEdgeCount(const std::string& path, const std::string& src_type, std::vector GetVertexTypes(const std::string& path) { auto graph_info = graphar::GraphInfo::Load(path).value(); auto vertex_infos = graph_info->GetVertexInfos(); - std::vector vertex_types(vertex_infos.size()); + std::vector vertex_types(vertex_infos.size()); for (const auto& vertex_info : vertex_infos) { vertex_types.push_back(vertex_info->GetType()); } @@ -138,7 +138,7 @@ std::vector GetVertexTypes(const std::string& path) { std::vector> GetEdgeTypes(const std::string& path) { auto graph_info = graphar::GraphInfo::Load(path).value(); auto edge_infos = graph_info->GetEdgeInfos(); - std::vector> edge_types(edge_infos.size()); + std::vector> edge_types; for (const auto& edge_info : edge_infos) { std::vector edge_type; edge_type.push_back(edge_info->GetSrcType()); diff --git a/cpp/src/graphar/arrow/chunk_writer.cc b/cpp/src/graphar/arrow/chunk_writer.cc index 2e3e73539..3654aaec4 100644 --- a/cpp/src/graphar/arrow/chunk_writer.cc +++ 
b/cpp/src/graphar/arrow/chunk_writer.cc @@ -261,7 +261,7 @@ Status VertexPropertyWriter::WriteTable( if (indice == -1) { // add index column GAR_ASSIGN_OR_RAISE(table_with_index, - addIndexColumn(input_table, start_chunk_index, + AddIndexColumn(input_table, start_chunk_index, vertex_info_->GetChunkSize())); } IdType chunk_size = vertex_info_->GetChunkSize(); @@ -281,7 +281,7 @@ Status VertexPropertyWriter::WriteTable( ValidateLevel validate_level) const { auto property_groups = vertex_info_->GetPropertyGroups(); GAR_ASSIGN_OR_RAISE(auto table_with_index, - addIndexColumn(input_table, start_chunk_index, + AddIndexColumn(input_table, start_chunk_index, vertex_info_->GetChunkSize())); for (auto& property_group : property_groups) { GAR_RETURN_NOT_OK(WriteTable(table_with_index, property_group, @@ -307,7 +307,7 @@ Result> VertexPropertyWriter::Make( return Make(vertex_info, graph_info->GetPrefix(), validate_level); } -Result> VertexPropertyWriter::addIndexColumn( +Result> VertexPropertyWriter::AddIndexColumn( const std::shared_ptr& table, IdType chunk_index, IdType chunk_size) const { arrow::Int64Builder array_builder; diff --git a/cpp/src/graphar/arrow/chunk_writer.h b/cpp/src/graphar/arrow/chunk_writer.h index 47edeff2e..e9a71df48 100644 --- a/cpp/src/graphar/arrow/chunk_writer.h +++ b/cpp/src/graphar/arrow/chunk_writer.h @@ -187,7 +187,7 @@ class VertexPropertyWriter { const std::shared_ptr& graph_info, const std::string& type, const ValidateLevel& validate_level = ValidateLevel::no_validate); - Result> addIndexColumn( + Result> AddIndexColumn( const std::shared_ptr& table, IdType chunk_index, IdType chunk_size) const; From 56aa77b161a476b6700c1e2096b110da216ce68c Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Tue, 29 Oct 2024 16:56:01 +0800 Subject: [PATCH 21/30] fix ci --- .github/workflows/cli.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index a316ee8f4..7e147e912 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -68,11 +68,11 @@ jobs: working-directory: "cli" run: | pip install ./ -v - graphar-cli --help - graphar-cli check -f ../testing/neo4j/MovieGraph.graph.yml - graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -v Person - graphar-cli show -f ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie - graphar-cli import -c import.mini.yml + graphar --help + graphar check -p ../testing/neo4j/MovieGraph.graph.yml + graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person + graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie + graphar import -c import.mini.yml - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4.0.1 From 955d6096f5d507fce49a9af3f90db0eb2a4ab2be Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Tue, 29 Oct 2024 17:03:37 +0800 Subject: [PATCH 22/30] fix ci --- cli/src/main.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cli/src/main.cc b/cli/src/main.cc index b3633c19f..4a0b03469 100644 --- a/cli/src/main.cc +++ b/cli/src/main.cc @@ -128,7 +128,8 @@ int64_t GetEdgeCount(const std::string& path, const std::string& src_type, std::vector GetVertexTypes(const std::string& path) { auto graph_info = graphar::GraphInfo::Load(path).value(); auto vertex_infos = graph_info->GetVertexInfos(); - std::vector vertex_types(vertex_infos.size()); + // TODO: change to unordered_set + std::vector vertex_types; for (const auto& vertex_info 
: vertex_infos) { vertex_types.push_back(vertex_info->GetType()); } @@ -138,6 +139,7 @@ std::vector GetVertexTypes(const std::string& path) { std::vector> GetEdgeTypes(const std::string& path) { auto graph_info = graphar::GraphInfo::Load(path).value(); auto edge_infos = graph_info->GetEdgeInfos(); + // TODO: change to unordered_set std::vector> edge_types; for (const auto& edge_info : edge_infos) { std::vector edge_type; From 726ea84d851cf5a08dc22abe6d0429855484dd44 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Tue, 29 Oct 2024 17:20:03 +0800 Subject: [PATCH 23/30] fix ci --- .github/workflows/cli.yml | 2 +- cli/import.full.yml | 2 +- cli/src/importer.h | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index 7e147e912..8363ff34a 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -109,6 +109,6 @@ jobs: graphar check -p ../testing/neo4j/MovieGraph.graph.yml graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie - graphar import -c import.mini.yml + graphar import -c import.full.yml # TODO: Add unit tests diff --git a/cli/import.full.yml b/cli/import.full.yml index cdc5611a3..ad05cc733 100644 --- a/cli/import.full.yml +++ b/cli/import.full.yml @@ -1,5 +1,5 @@ graphar: # Global configuration of imported data - path: /workspaces/incubator-graphar/graphar/movie # (Required) Path to the graphar directory + path: /tmp/graphar/movie # (Required) Path to the graphar directory name: MovieGraph # (Required) Name of the graph vertex_chunk_size: 100 # Number of vertices per chunk edge_chunk_size: 1024 # Number of edges per chunk diff --git a/cli/src/importer.h b/cli/src/importer.h index e142de4a4..11e4a6bd0 100644 --- a/cli/src/importer.h +++ b/cli/src/importer.h @@ -82,7 +82,6 @@ struct Edge { std::string src_prop; std::string dst_type; std::string dst_prop; - std::vector labels; int chunk_size; std::string validate_level; std::string prefix; @@ -161,7 +160,6 @@ ImportConfig ConvertPyDictToConfig(const py::dict& config_dict) { edge.src_prop = edge_dict["src_prop"].cast(); edge.dst_type = edge_dict["dst_type"].cast(); edge.dst_prop = edge_dict["dst_prop"].cast(); - edge.labels = edge_dict["labels"].cast>(); edge.chunk_size = edge_dict["chunk_size"].cast(); edge.validate_level = edge_dict["validate_level"].cast(); edge.prefix = edge_dict["prefix"].cast(); From f0fe5e9fcdcde2789c2b433389f09c39c60ad764 Mon Sep 17 00:00:00 2001 From: Liu Jiajun <939282975@qq.com> Date: Wed, 30 Oct 2024 00:51:16 +0800 Subject: [PATCH 24/30] Update cli.yml --- .github/workflows/cli.yml | 60 ++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index 8363ff34a..e63de03a1 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -73,42 +73,44 @@ jobs: graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie graphar import -c import.mini.yml +# TODO: Add unit tests + - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4.0.1 with: token: ${{ secrets.CODECOV_TOKEN }} - macos: - name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} CLI - runs-on: macos-${{ matrix.macos-version }} - if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.event.pull_request.draft == false 
}} - strategy: - fail-fast: false - matrix: - include: - - architecture: AMD64 - macos-version: "12" - - architecture: ARM64 - macos-version: "14" - steps: - - uses: actions/checkout@v3 - with: - submodules: true + # macos: + # name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} CLI + # runs-on: macos-${{ matrix.macos-version }} + # if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.event.pull_request.draft == false }} + # strategy: + # fail-fast: false + # matrix: + # include: + # - architecture: AMD64 + # macos-version: "12" + # - architecture: ARM64 + # macos-version: "14" + # steps: + # - uses: actions/checkout@v3 + # with: + # submodules: true - - name: Install dependencies - run: | - brew bundle --file=cpp/Brewfile + # - name: Install dependencies + # run: | + # brew bundle --file=cpp/Brewfile - - name: Build GraphAr And Run Tests - working-directory: "cli" - run: | - pip install ./ - graphar --help - graphar check -p ../testing/neo4j/MovieGraph.graph.yml - graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person - graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie - graphar import -c import.full.yml + # - name: Build GraphAr And Run Tests + # working-directory: "cli" + # run: | + # pip install ./ + # graphar --help + # graphar check -p ../testing/neo4j/MovieGraph.graph.yml + # graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person + # graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie + # graphar import -c import.full.yml - # TODO: Add unit tests +# TODO: Add unit tests From 18ba462fb400c32c604c95e91b439bc8d81187d8 Mon Sep 17 00:00:00 2001 From: Liu Jiajun <939282975@qq.com> Date: Wed, 30 Oct 2024 20:41:12 +0800 Subject: [PATCH 25/30] Update cli.yml --- .github/workflows/cli.yml | 56 +++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index e63de03a1..9d2284d69 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -81,36 +81,36 @@ jobs: with: token: ${{ secrets.CODECOV_TOKEN }} - # macos: - # name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} CLI - # runs-on: macos-${{ matrix.macos-version }} - # if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.event.pull_request.draft == false }} - # strategy: - # fail-fast: false - # matrix: - # include: - # - architecture: AMD64 - # macos-version: "12" - # - architecture: ARM64 - # macos-version: "14" - # steps: - # - uses: actions/checkout@v3 - # with: - # submodules: true + macos: + name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} CLI + runs-on: macos-${{ matrix.macos-version }} + if: false + strategy: + fail-fast: false + matrix: + include: + - architecture: AMD64 + macos-version: "12" + - architecture: ARM64 + macos-version: "14" + steps: + - uses: actions/checkout@v3 + with: + submodules: true - # - name: Install dependencies - # run: | - # brew bundle --file=cpp/Brewfile + - name: Install dependencies + run: | + brew bundle --file=cpp/Brewfile - # - name: Build GraphAr And Run Tests - # working-directory: "cli" - # run: | - # pip install ./ - # graphar --help - # graphar check -p ../testing/neo4j/MovieGraph.graph.yml - # graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person - # graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie - # graphar import -c import.full.yml + - name: Build GraphAr And Run Tests 
+ working-directory: "cli" + run: | + pip install ./ + graphar --help + graphar check -p ../testing/neo4j/MovieGraph.graph.yml + graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person + graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie + graphar import -c import.full.yml # TODO: Add unit tests From 4af91177b37cb025d0aa05fe14fda3e951b1bceb Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Thu, 31 Oct 2024 02:47:11 +0800 Subject: [PATCH 26/30] fix config type --- .github/workflows/cli.yml | 7 +- cli/README.md | 7 +- cli/import.full.yml | 138 ----------------------------- cli/import.mini.yml | 71 --------------- cli/src/graphar_cli/graphar_cli.py | 18 +--- 5 files changed, 9 insertions(+), 232 deletions(-) delete mode 100644 cli/import.full.yml delete mode 100644 cli/import.mini.yml diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index 9d2284d69..afc04eeb6 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -72,7 +72,7 @@ jobs: graphar check -p ../testing/neo4j/MovieGraph.graph.yml graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie - graphar import -c import.mini.yml + graphar import -c ../testing/neo4j/data/import.mini.yml # TODO: Add unit tests @@ -84,7 +84,8 @@ jobs: macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} CLI runs-on: macos-${{ matrix.macos-version }} - if: false + # TODO: Remove this when the macos issue is fixed + if: false strategy: fail-fast: false matrix: @@ -111,6 +112,6 @@ jobs: graphar check -p ../testing/neo4j/MovieGraph.graph.yml graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie - graphar import -c import.full.yml + graphar import -c ../testing/neo4j/data/import.mini.yml # TODO: Add unit tests diff --git a/cli/README.md b/cli/README.md index 26c8eeacd..0f2ac54f2 100644 --- a/cli/README.md +++ b/cli/README.md @@ -39,12 +39,12 @@ graphar show -p ../testing/neo4j/MovieGraph.graph.yml -v Person graphar show -p ../testing/neo4j/MovieGraph.graph.yml -es Person -e ACTED_IN -ed Movie # import graph data by using a config file -graphar import -c import.mini.yml +graphar import -c ../testing/neo4j/data/import.mini.yml ``` ## Import config file -The config file supports two data types. We provide two reference templates for each type: full and mini. +The config file supports `yaml` data type. We provide two reference templates for it: full and mini. The full version of the configuration file contains all configurable fields, and additional fields will be automatically ignored. @@ -52,8 +52,7 @@ The mini version of the configuration file is a simplified version of the full c For the full configuration file, if all fields can be set to their default values, you can simplify it to the mini version. However, it cannot be further reduced beyond the mini version. - -In the yaml config files, we provide brief comments on the fields, which can be used as a reference +In the full `yaml` config file, we provide brief comments on the fields, which can be used as a reference. 
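To see what these templates feed into: the `import` command parses the YAML and validates it with the pydantic models before handing anything to the C++ extension. A minimal sketch of that flow (assuming `ImportConfig` from `graphar_cli.config`; the `graphar_cli.importer` import path for the `validate` helper is an assumption):

```python
from pathlib import Path

import yaml

from graphar_cli.config import ImportConfig
from graphar_cli.importer import validate  # assumed module for the helper

# Parse the YAML config; pydantic fills defaults and checks field types.
config = yaml.safe_load(
    Path("../testing/neo4j/data/import.mini.yml").read_text(encoding="utf-8")
)
import_config = ImportConfig(**config)

# Extra semantic checks the CLI runs before importing.
validate(import_config)
```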
**Example** diff --git a/cli/import.full.yml b/cli/import.full.yml deleted file mode 100644 index ad05cc733..000000000 --- a/cli/import.full.yml +++ /dev/null @@ -1,138 +0,0 @@ -graphar: # Global configuration of imported data - path: /tmp/graphar/movie # (Required) Path to the graphar directory - name: MovieGraph # (Required) Name of the graph - vertex_chunk_size: 100 # Number of vertices per chunk - edge_chunk_size: 1024 # Number of edges per chunk - file_type: parquet # Default file type for vertices and edges, can be "parquet", "orc", "csv", "json" - adj_list_type: ordered_by_source # Default adjacency list type, can be "ordered_by_source", "ordered_by_dest", "unordered_by_source", "unordered_by_dest" - validate_level: weak # Default validation level for vertices and edges, can be "no", "weak", "strong" - version: gar/v1 # Version of the graphar schema - -import_schema: - vertices: - - type: Person - labels: - - Person - chunk_size: 100 - validate_level: "no" - prefix: vertex/Person/ - property_groups: - - file_type: parquet - properties: - - name: name - data_type: string - is_primary: true - nullable: false - - name: born - data_type: int64 - is_primary: false - nullable: true - sources: - - file_type: parquet - path: /workspaces/incubator-graphar/testing/neo4j/data/Person.parquet - columns: - name: name - born: born - - type: Movie - labels: - - Movie - chunk_size: 100 - validate_level: "no" - prefix: vertex/Movie/ - property_groups: - - file_type: parquet - properties: - - name: title - data_type: string - is_primary: true - nullable: false - - name: tagline - data_type: string - is_primary: false - nullable: true - sources: - - file_type: orc - path: /workspaces/incubator-graphar/testing/neo4j/data/Movie.orc - columns: - title: title - tagline: tagline - edges: - - edge_type: WROTE - src_type: Person - src_prop: name - dst_type: Movie - dst_prop: title - chunk_size: 1024 - validate_level: "no" - prefix: edge/Person_WROTE_Movie/ - adj_lists: - - ordered: true - aligned_by: src - file_type: parquet - - ordered: true - aligned_by: dst - file_type: parquet - sources: - - file_type: csv - delimiter: "," - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_WROTE_Movie.csv - columns: - name: name - title: title - - edge_type: ACTED_IN - src_type: Person - src_prop: name - dst_type: Movie - dst_prop: title - validate_level: "no" - chunk_size: 1024 - prefix: edge/Person_ACTED_IN_Movie/ - adj_lists: - - ordered: true - aligned_by: src - file_type: parquet - - ordered: true - aligned_by: dst - file_type: parquet - sources: - - file_type: json - delimiter: "," # will be ignored - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_ACTED_IN_Movie.json - columns: - name: name - title: title - - edge_type: REVIEWED - src_type: Person - src_prop: name - dst_type: Movie - dst_prop: title - chunk_size: 1024 - validate_level: "no" - prefix: edge/Person_REVIEWED_Movie/ - property_groups: - - file_type: parquet - properties: - - name: rating - data_type: int64 - is_primary: false - nullable: true - - name: summary - data_type: string - is_primary: false - nullable: true - adj_lists: - - ordered: true - aligned_by: src - file_type: parquet - - ordered: true - aligned_by: dst - file_type: parquet - sources: - - file_type: csv - delimiter: "," - path: /workspaces/incubator-graphar/testing/neo4j/data/Person_REVIEWED_Movie.csv - columns: - name: name - title: title - summary: summary - rating: rating diff --git a/cli/import.mini.yml b/cli/import.mini.yml deleted file mode 100644 index 
86cf92f37..000000000 --- a/cli/import.mini.yml +++ /dev/null @@ -1,71 +0,0 @@ -graphar: - path: /tmp/graphar/movie - name: MovieGraph - -import_schema: - vertices: - - type: Person - property_groups: - - properties: - - name: name - data_type: string - is_primary: true - - name: born - data_type: int64 - sources: - - path: ../testing/neo4j/data/Person.parquet - columns: - name: name - born: born - - type: Movie - property_groups: - - properties: - - name: title - data_type: string - is_primary: true - - name: tagline - data_type: string - sources: - - path: ../testing/neo4j/data/Movie.orc - columns: - title: title - tagline: tagline - edges: - - edge_type: WROTE - src_type: Person - src_prop: name - dst_type: Movie - dst_prop: title - sources: - - path: ../testing/neo4j/data/Person_WROTE_Movie.csv - columns: - name: name - title: title - - edge_type: ACTED_IN - src_type: Person - src_prop: name - dst_type: Movie - dst_prop: title - sources: - - path: ../testing/neo4j/data/Person_ACTED_IN_Movie.json - columns: - name: name - title: title - - edge_type: REVIEWED - src_type: Person - src_prop: name - dst_type: Movie - dst_prop: title - property_groups: - - properties: - - name: rating - data_type: int64 - - name: summary - data_type: string - sources: - - path: ../testing/neo4j/data/Person_REVIEWED_Movie.csv - columns: - name: name - title: title - summary: summary - rating: rating diff --git a/cli/src/graphar_cli/graphar_cli.py b/cli/src/graphar_cli/graphar_cli.py index 193575e85..725029d11 100644 --- a/cli/src/graphar_cli/graphar_cli.py +++ b/cli/src/graphar_cli/graphar_cli.py @@ -147,28 +147,14 @@ def check( ) def import_data( config_file: str = typer.Option(None, "--config", "-c", help="Path of the GraphAr config file"), - config_type: str = typer.Option( - "yaml", - "--type", - "-t", - help="Type of config file", - show_choices=True, - show_default=True, - ), ): if not Path(config_file).is_file(): logger.error("File not found: %s", config_file) raise typer.Exit(1) - if config_type == "json": - with Path(config_file).open(encoding="utf-8") as file: - config = json.load(file) - elif config_type == "yaml": + + try: with Path(config_file).open(encoding="utf-8") as file: config = yaml.safe_load(file) - else: - logger.error("Config type %s not supported", config_type) - raise typer.Exit(1) - try: import_config = ImportConfig(**config) validate(import_config) except Exception as e: From 7b81e42da00f64d237c1202d10eb6cc0136aef2e Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Thu, 31 Oct 2024 15:00:36 +0800 Subject: [PATCH 27/30] fix ci with new testing --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index 0b22fba6f..955596c32 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 0b22fba6f9e0f467b1121feae379e60eb2eb9f99 +Subproject commit 955596c325ceba7b607e285738e3dd0ce4ff424e From 6ddac6198f2813c33d93d4c6aa4b50feb557342d Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Thu, 31 Oct 2024 16:32:57 +0800 Subject: [PATCH 28/30] use enum --- cli/src/graphar_cli/config.py | 28 ++++++++++++++++++++-------- cli/src/graphar_cli/graphar_cli.py | 1 - 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py index c7b337772..eb9eda9e6 100644 --- a/cli/src/graphar_cli/config.py +++ b/cli/src/graphar_cli/config.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. 
+from enum import Enum from logging import getLogger from pathlib import Path from typing import Dict, List, Literal, Optional # TODO: move to the TYPE_CHECKING block -from pydantic import BaseModel, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, field_validator, model_validator from typing_extensions import Self logger = getLogger("graphar_cli") @@ -31,7 +32,14 @@ DEFAULT_REGULAR_SEPARATOR = "_" DEFAULT_VALIDATE_LEVEL = "weak" DEFAULT_VERSION = "gar/v1" -SUPPORT_FILE_TYPES = {"parquet", "orc", "csv", "json"} + + +class FileType(str, Enum): + parquet = "parquet" + csv = "csv" + orc = "orc" + json = "json" + class GraphArConfig(BaseModel): path: str @@ -57,7 +65,6 @@ def check_path(cls, v): class Property(BaseModel): - name: str data_type: Literal["bool", "int32", "int64", "float", "double", "string", "date", "timestamp"] is_primary: bool = False @@ -77,7 +84,7 @@ def check_nullable(self) -> Self: class PropertyGroup(BaseModel): properties: List[Property] - file_type: Optional[Literal["parquet", "orc", "csv", "json"]] = None + file_type: Optional[FileType] = None @field_validator("properties") def check_properties_length(cls, v): @@ -88,7 +95,7 @@ def check_properties_length(cls, v): class Source(BaseModel): - file_type: Optional[Literal["parquet", "orc", "csv", "json"]] = None + file_type: Optional[FileType] = None path: str delimiter: str = "," columns: Dict[str, str] @@ -115,8 +122,8 @@ def check_file_type(self) -> Self: if file_type == "": msg = f"File {self.path} has no file type suffix" raise ValueError(msg) - if file_type not in SUPPORT_FILE_TYPES: - msg = f"File type {file_type} not supported" + if file_type not in FileType.__members__: + msg = f"Invalid file type '{file_type}'" raise ValueError(msg) self.file_type = file_type return self @@ -157,7 +164,7 @@ def check_vertex_prefix(self) -> Self: class AdjList(BaseModel): ordered: bool aligned_by: Literal["src", "dst"] - file_type: Literal["parquet", "orc", "csv", "json"] + file_type: Optional[FileType] = None class Edge(BaseModel): @@ -208,6 +215,8 @@ def check_property_groups_length(cls, v): class ImportConfig(BaseModel): + model_config = ConfigDict(use_enum_values=True) + graphar: GraphArConfig import_schema: ImportSchema @@ -246,6 +255,9 @@ def check_none_types(self) -> Self: else: msg = f"Invalid adj_list_type '{self.graphar.adj_list_type}'" raise ValueError(msg) + for adj_list in edge.adj_lists: + if adj_list.file_type is None: + adj_list.file_type = self.graphar.file_type for property_group in edge.property_groups: if property_group.file_type is None: property_group.file_type = self.graphar.file_type diff --git a/cli/src/graphar_cli/graphar_cli.py b/cli/src/graphar_cli/graphar_cli.py index 725029d11..1c5be2e6a 100644 --- a/cli/src/graphar_cli/graphar_cli.py +++ b/cli/src/graphar_cli/graphar_cli.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-import json from logging import getLogger from pathlib import Path from typing import List From 9f3258b85c0640b7b846a2fe804d446f03db72b1 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Thu, 31 Oct 2024 16:42:12 +0800 Subject: [PATCH 29/30] use enum --- cli/src/graphar_cli/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/graphar_cli/config.py b/cli/src/graphar_cli/config.py index eb9eda9e6..7322ddb41 100644 --- a/cli/src/graphar_cli/config.py +++ b/cli/src/graphar_cli/config.py @@ -46,7 +46,7 @@ class GraphArConfig(BaseModel): name: str vertex_chunk_size: Optional[int] = 100 edge_chunk_size: Optional[int] = 1024 - file_type: Literal["parquet", "orc", "csv", "json"] = DEFAULT_FILE_TYPE + file_type: FileType = DEFAULT_FILE_TYPE adj_list_type: Literal[ "ordered_by_source", "ordered_by_dest", "unordered_by_source", "unordered_by_dest" ] = DEFAULT_ADJ_LIST_TYPE From 2aa64ad6d9fb4841f8edca97a39e15f945589289 Mon Sep 17 00:00:00 2001 From: jasinliu <939282975@qq.com> Date: Thu, 31 Oct 2024 17:27:27 +0800 Subject: [PATCH 30/30] pin pydantic version --- cli/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/pyproject.toml b/cli/pyproject.toml index 5dda7103b..8c0003a0b 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -10,7 +10,7 @@ description = "GraphAr command line tool" readme = "README.md" authors = [{ name = "GraphAr community", email = "dev@graphar.apache.org" }] requires-python = ">=3.7" -dependencies = ["typer ~= 0.1", "pydantic ~= 2.0", "pyyaml ~= 6.0"] +dependencies = ["typer ~= 0.1", "pydantic ~= 2.0, < 2.7", "pyyaml ~= 6.0"] [project.optional-dependencies] test = ["pandas ~= 2.0", "typing_extensions ~= 4.0"]
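A note on source file types after the `FileType` enum change: when a `Source` entry omits `file_type`, the model validator infers it from the path suffix and rejects suffixes outside the enum. A minimal sketch of that behavior, reusing the testing data paths from this series:

```python
from graphar_cli.config import Source

# file_type is omitted, so the validator derives it from ".parquet".
src = Source(
    path="../testing/neo4j/data/Person.parquet",
    columns={"name": "name", "born": "born"},
)
print(src.file_type)  # parquet

# A suffix outside FileType fails validation with
# "Invalid file type 'xlsx'" wrapped in a pydantic ValidationError.
Source(path="Person.xlsx", columns={"name": "name"})
```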
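On the closing `pydantic ~= 2.0, < 2.7` pin: the models rely on `ConfigDict(use_enum_values=True)` so that validated enum fields are stored as plain strings, which the pybind11 layer casts to `std::string`. The sketch below reproduces that behavior on a reduced model (`MiniConfig` is a hypothetical stand-in; the exact pydantic 2.7 incompatibility motivating the upper bound is assumed, not documented in this series):

```python
from enum import Enum

from pydantic import BaseModel, ConfigDict


class FileType(str, Enum):
    parquet = "parquet"
    csv = "csv"
    orc = "orc"
    json = "json"


class MiniConfig(BaseModel):
    # Mirrors ImportConfig's model_config in this series.
    model_config = ConfigDict(use_enum_values=True)

    file_type: FileType = FileType.parquet


cfg = MiniConfig(file_type="orc")
# use_enum_values replaces the enum member with its value after
# validation, so dumps hand plain strings to the C++ side.
print(cfg.file_type, cfg.model_dump())  # orc {'file_type': 'orc'}
```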