Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor/gitingest structure #66

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/gitingest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def main(

if not output:
output = "digest.txt"
summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output)
summary, _, _ = ingest(source, max_size, include_patterns, exclude_patterns, output=output)

click.echo(f"Analysis complete! Output written to: {output}")
click.echo("\nSummary:")
Expand Down
171 changes: 118 additions & 53 deletions src/gitingest/clone.py
Original file line number Diff line number Diff line change
@@ -1,79 +1,144 @@
import asyncio
from typing import Any, Dict, Tuple
from dataclasses import dataclass
from typing import Optional, Tuple

from gitingest.utils import async_timeout
from gitingest.utils import AsyncTimeoutError, async_timeout

CLONE_TIMEOUT = 20


@dataclass
class CloneConfig:
    """Parameters describing a single repository clone operation."""

    # URL of the repository to clone.
    url: str
    # Local filesystem path the repository is cloned into.
    local_path: str
    # Specific commit hash to check out after cloning, if any.
    commit: Optional[str] = None
    # Branch to clone, if any.
    branch: Optional[str] = None


async def check_repo_exists(url: str) -> bool:
    """
    Check if a repository exists at the given URL using an HTTP HEAD request.

    The request is issued by running ``curl -I`` as a subprocess and
    inspecting the response headers it prints.

    Parameters
    ----------
    url : str
        The URL of the repository.

    Returns
    -------
    bool
        False if ``curl`` exits with a non-zero status or the response
        headers contain a 404 status line, True otherwise.
    """
    proc = await asyncio.create_subprocess_exec(
        "curl",
        "-I",
        url,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    # Drain the process exactly once; stderr is captured but not needed.
    stdout, _ = await proc.communicate()
    if proc.returncode != 0:
        return False
    # Check if stdout contains "404" status code
    stdout_str = stdout.decode()
    return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str


@async_timeout(CLONE_TIMEOUT)
async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]:
if not await check_repo_exists(query['url']):
raise ValueError("Repository not found, make sure it is public")
async def run_git_command(*args: str) -> Tuple[bytes, bytes]:
    """
    Executes a git command asynchronously and captures its output.

    Parameters
    ----------
    *args : str
        The git command and its arguments to execute. The first element is
        the program to run (e.g. ``"git"``).

    Returns
    -------
    Tuple[bytes, bytes]
        A tuple containing the stdout and stderr of the git command.

    Raises
    ------
    RuntimeError
        If the git command exits with a non-zero status.
    """
    proc = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        # Decode leniently so undecodable bytes in stderr cannot mask the
        # real failure with a UnicodeDecodeError.
        error_message = stderr.decode(errors="replace").strip()
        raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}")

    return stdout, stderr


@async_timeout(CLONE_TIMEOUT)
async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]:
    """
    Clones a repository to a local path based on the provided configuration.

    Parameters
    ----------
    config : CloneConfig
        The clone configuration, with the following attributes:
            - url (str): The URL of the repository.
            - local_path (str): The local path to clone the repository to.
            - commit (Optional[str]): The specific commit hash to checkout.
            - branch (Optional[str]): The branch to clone. When empty, 'main'
              or 'master' (case-insensitive), no ``--branch`` flag is passed
              to ``git clone``.

    Returns
    -------
    Tuple[bytes, bytes]
        A tuple containing the stdout and stderr of the last git command executed.

    Raises
    ------
    ValueError
        If the repository does not exist or if `url` or `local_path` is empty.
    RuntimeError
        If any git command fails during execution.
    AsyncTimeoutError
        If the cloning process exceeds the specified timeout.
    """
    url = config.url
    local_path = config.local_path
    commit = config.commit
    branch = config.branch

    if not url:
        raise ValueError("The 'url' parameter is required.")

    if not local_path:
        raise ValueError("The 'local_path' parameter is required.")

    # Check if the repository exists
    if not await check_repo_exists(url):
        raise ValueError("Repository not found, make sure it is public")

    if commit:
        # Scenario 1: Clone and checkout a specific commit.
        # Clone the repository without depth to ensure full history for checkout.
        await run_git_command("git", "clone", "--single-branch", url, local_path)
        return await run_git_command("git", "-C", local_path, "checkout", commit)

    if branch and branch.lower() not in ('main', 'master'):
        # Scenario 2: Clone a specific branch with shallow depth.
        return await run_git_command(
            "git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path
        )

    # Scenario 3: Clone without a --branch flag, with shallow depth.
    return await run_git_command("git", "clone", "--depth=1", "--single-branch", url, local_path)
15 changes: 12 additions & 3 deletions src/gitingest/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
from pathlib import Path
from typing import List, Optional, Tuple, Union

from gitingest.clone import clone_repo
from gitingest.clone import CloneConfig, clone_repo
from gitingest.ingest_from_query import ingest_from_query
from gitingest.parse_query import parse_query


def ingest(
source: str,
max_file_size: int = 10 * 1024 * 1024,
max_file_size: int = 10 * 1024 * 1024, # 10 MB
include_patterns: Union[List[str], str, None] = None,
exclude_patterns: Union[List[str], str, None] = None,
output: Optional[str] = None,
Expand All @@ -25,7 +25,16 @@ def ingest(
ignore_patterns=exclude_patterns,
)
if query['url']:
clone_result = clone_repo(query)

# Extract relevant fields for CloneConfig
clone_config = CloneConfig(
url=query["url"],
local_path=query['local_path'],
commit=query.get('commit'),
branch=query.get('branch'),
)
clone_result = clone_repo(clone_config)

if inspect.iscoroutine(clone_result):
asyncio.run(clone_result)
else:
Expand Down
11 changes: 3 additions & 8 deletions src/gitingest/ingest_from_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str:
return output


def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str:
def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str:
"""Creates a summary string with file counts and content size."""
if "user_name" in query:
summary = f"Repository: {query['user_name']}/{query['repo_name']}\n"
Expand All @@ -297,12 +297,7 @@ def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: L
return summary


def create_tree_structure(
query: Dict[str, Any],
node: Dict[str, Any],
prefix: str = "",
is_last: bool = True,
) -> str:
def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: str = "", is_last: bool = True) -> str:
"""Creates a tree-like string representation of the file structure."""
tree = ""

Expand Down Expand Up @@ -386,7 +381,7 @@ def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]:
if not nodes:
raise ValueError(f"No files found in {path}")
files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size'])
summary = create_summary_string(query, nodes, files)
summary = create_summary_string(query, nodes)
tree = "Directory structure:\n" + create_tree_structure(query, nodes)
files_content = create_file_content_string(files)

Expand Down
Loading
Loading