=== api/.gitignore ===
/target

=== api/Cargo.toml ===
[package]
name = "githem-api"
description = "Githem API server - Git repository hosting service"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
homepage.workspace = true

[[bin]]
name = "githem-api"
path = "src/main.rs"

[features]
default = []
rate-limit = ["dep:tower_governor"]

[dependencies]
githem-core = { version = "0.4.0", path = "../core" }
anyhow = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
tower = { workspace = true }
tower-http = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
base64 = { workspace = true }
html-escape = { workspace = true }
url = { workspace = true }
axum = { workspace = true }
rand = { workspace = true }

# Optional rate limiting
tower_governor = { version = "0.7", optional = true }
tempfile = "3.8"
sha2 = "0.10"

=== api/src/cache.rs ===
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tokio::sync::RwLock;

#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CachedRepository {
    pub key: String,
    pub url: String,
    pub commit_hash: String,
    pub result: crate::ingestion::IngestionResult,
    pub created_at: u64,    // unix timestamp
    pub last_accessed: u64, // unix timestamp
    pub access_count: u64,
    pub size_bytes: usize,
}

pub struct RepositoryCache {
    cache: Arc<RwLock<HashMap<String, CachedRepository>>>,
    max_size: usize,
    ttl_seconds: u64,
    metrics: Arc<crate::metrics::MetricsCollector>,
}

impl RepositoryCache {
    pub fn new(
        max_size: usize,
        ttl: Duration,
        metrics: Arc<crate::metrics::MetricsCollector>,
    ) -> Self {
        Self {
            cache: Arc::new(RwLock::new(HashMap::new())),
            max_size,
            ttl_seconds: ttl.as_secs(),
            metrics,
        }
    }

    fn current_timestamp() -> u64 {
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs()
    }

    pub fn generate_key(
        url: &str,
        branch: Option<&str>,
        preset: Option<&str>,
        path: Option<&str>,
    ) -> String {
        let mut hasher = Sha256::new();
        hasher.update(url.as_bytes());
        if let Some(branch) = branch {
            hasher.update(b":");
            hasher.update(branch.as_bytes());
        }
        if let Some(preset) = preset {
            hasher.update(b":");
            hasher.update(preset.as_bytes());
        }
        if let Some(path) = path {
            hasher.update(b":");
            hasher.update(path.as_bytes());
        }
        format!("{:x}", hasher.finalize())
    }

    pub async fn get(&self, key: &str) -> Option<CachedRepository> {
        let mut cache = self.cache.write().await;
        let now = Self::current_timestamp();

        if let Some(entry) = cache.get_mut(key) {
            // check ttl
            if now - entry.created_at > self.ttl_seconds {
                cache.remove(key);
                self.metrics.record_cache_miss().await;
                return None;
            }

            entry.last_accessed = now;
            entry.access_count += 1;
            self.metrics.record_cache_hit().await;
            Some(entry.clone())
        } else {
            self.metrics.record_cache_miss().await;
            None
        }
    }

    pub async fn put(
        &self,
        key: String,
        url: String,
        commit_hash: String,
        result: crate::ingestion::IngestionResult,
    ) {
        let size_bytes = result.content.len();
        let now = Self::current_timestamp();

        let entry = CachedRepository {
            key: key.clone(),
            url,
            commit_hash,
            result,
            created_at: now,
            last_accessed: now,
            access_count: 1,
            size_bytes,
        };

        let mut cache = self.cache.write().await;

        // enforce size limit with lru eviction
        while self.calculate_size(&cache) + size_bytes > self.max_size && !cache.is_empty() {
            // find least recently used
            let lru_key = cache
                .values()
                .min_by_key(|e| e.last_accessed)
                .map(|e| e.key.clone());
            if let Some(key) = lru_key {
                cache.remove(&key);
            }
        }

        cache.insert(key, entry);
    }

    pub async fn _invalidate(&self, key: &str) {
        let mut cache = self.cache.write().await;
        cache.remove(key);
    }

    pub async fn _clear(&self) {
        let mut cache = self.cache.write().await;
        cache.clear();
    }

    pub async fn stats(&self) -> CacheStats {
        let cache = self.cache.read().await;
        CacheStats {
            entries: cache.len(),
            total_size: self.calculate_size(&cache),
            max_size: self.max_size,
            hit_rate: self.calculate_hit_rate(&cache),
            top_accessed: self.get_top_accessed(&cache, 10),
        }
    }

    fn calculate_size(&self, cache: &HashMap<String, CachedRepository>) -> usize {
        cache.values().map(|e| e.size_bytes).sum()
    }

    fn calculate_hit_rate(&self, cache: &HashMap<String, CachedRepository>) -> f64 {
        let total_accesses: u64 = cache.values().map(|e| e.access_count).sum();
        let cache_hits: u64 = cache
            .values()
            .map(|e| e.access_count.saturating_sub(1))
            .sum();
        if total_accesses > 0 {
            cache_hits as f64 / total_accesses as f64
        } else {
            0.0
        }
    }

    fn get_top_accessed(
        &self,
        cache: &HashMap<String, CachedRepository>,
        limit: usize,
    ) -> Vec<(String, u64)> {
        let mut entries: Vec<_> = cache
            .values()
            .map(|e| (e.url.clone(), e.access_count))
            .collect();
        entries.sort_by(|a, b| b.1.cmp(&a.1));
        entries.truncate(limit);
        entries
    }
}

#[derive(Debug, Serialize, Deserialize)]
pub struct CacheStats {
    pub entries: usize,
    pub total_size: usize,
    pub max_size: usize,
    pub hit_rate: f64,
    pub top_accessed: Vec<(String, u64)>,
}

=== api/src/http.rs ===
use crate::cache::RepositoryCache;
use crate::ingestion::{IngestionParams, IngestionService};
use crate::metrics::MetricsCollector;
use githem_core::validate_github_name;
use std::sync::Arc;
use std::time::{Duration, Instant};

use axum::{
    extract::{Path, Query, State},
    http::{header, HeaderMap, StatusCode},
    response::{Html, IntoResponse, Json, Response},
    routing::{get, post},
    Router,
};
use serde::{Deserialize, Serialize};
use tokio::time::timeout;
use tower::ServiceBuilder;
use tower_http::{
    compression::CompressionLayer, cors::CorsLayer, set_header::SetResponseHeaderLayer,
};

const INGEST_TIMEOUT: Duration = Duration::from_secs(300);

#[derive(Clone)]
pub struct AppState {
    pub repo_cache: Arc<RepositoryCache>,
    pub metrics: Arc<MetricsCollector>,
}

impl Default for AppState {
    fn default() -> Self {
        Self::new()
    }
}

impl AppState {
    pub fn new() -> Self {
        let metrics = Arc::new(MetricsCollector::new());
        Self {
            repo_cache: Arc::new(RepositoryCache::new(
                5 * 1024 * 1024 * 1024,    // 5GB
                Duration::from_secs(3600), // 1 hour TTL
                metrics.clone(),
            )),
            metrics,
        }
    }
}

#[derive(Debug, Serialize, Deserialize)]
pub struct IngestRequest {
    pub url: String,
    pub branch: Option<String>,
    pub subpath: Option<String>,
    pub path_prefix: Option<String>,
    #[serde(default)]
    pub include_patterns: Vec<String>,
    #[serde(default)]
    pub exclude_patterns: Vec<String>,
    #[serde(default = "default_max_file_size")]
    pub max_file_size: usize,
    pub filter_preset: Option<String>,
    #[serde(default)]
    pub raw: bool,
}

fn default_max_file_size() -> usize {
    10 * 1024 * 1024
}

#[derive(Debug, Serialize, Deserialize)]
pub struct IngestResponse {
    pub id: String,
    pub status: String,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct ErrorResponse {
    pub error: String,
    pub code: String,
}

#[derive(Debug)]
pub enum AppError {
    InvalidRequest(String),
    NotFound,
    Timeout,
    InternalError(String),
}

impl IntoResponse for AppError {
    fn into_response(self) -> axum::response::Response {
        let (status, error_response) = match self {
            AppError::InvalidRequest(msg) => (
                StatusCode::BAD_REQUEST,
                ErrorResponse {
                    error: msg,
                    code: "INVALID_REQUEST".to_string(),
                },
            ),
            AppError::NotFound => (
                StatusCode::NOT_FOUND,
                ErrorResponse {
                    error: "Resource not found".to_string(),
                    code: "NOT_FOUND".to_string(),
                },
            ),
            AppError::Timeout
=> ( StatusCode::REQUEST_TIMEOUT, ErrorResponse { error: "Request timed out".to_string(), code: "TIMEOUT".to_string(), }, ), AppError::InternalError(msg) => ( StatusCode::INTERNAL_SERVER_ERROR, ErrorResponse { error: msg, code: "INTERNAL_ERROR".to_string(), }, ), }; (status, Json(error_response)).into_response() } } #[derive(Deserialize)] pub struct QueryParams { pub branch: Option, pub subpath: Option, pub include: Option, pub exclude: Option, pub max_size: Option, pub preset: Option, pub raw: Option, pub path: Option, } // Serve static files async fn serve_static_file(filename: &str) -> Response { let (content, content_type) = match filename { "index.html" | "" => ( include_str!("../../get/web/index.html"), "text/html; charset=utf-8", ), "help.html" => ( include_str!("../../get/web/help.html"), "text/html; charset=utf-8", ), "styles.css" => ( include_str!("../../get/web/styles.css"), "text/css; charset=utf-8", ), "globals.css" => ( include_str!("../../get/web/globals.css"), "text/css; charset=utf-8", ), "install.sh" => ( include_str!("../../get/install.sh"), "text/plain; charset=utf-8", ), "install.ps1" => ( include_str!("../../get/install/install.ps1"), "text/plain; charset=utf-8", ), _ => { return (StatusCode::NOT_FOUND, Html("404 Not Found")).into_response(); } }; Response::builder() .status(StatusCode::OK) .header(header::CONTENT_TYPE, content_type) .body(axum::body::Body::from(content)) .unwrap() } async fn landing_page() -> Response { serve_static_file("index.html").await } async fn help_page() -> Response { serve_static_file("help.html").await } async fn styles_css() -> Response { serve_static_file("styles.css").await } async fn globals_css() -> Response { serve_static_file("globals.css").await } async fn install_sh() -> Response { serve_static_file("install.sh").await } async fn install_ps1() -> Response { serve_static_file("install.ps1").await } async fn health() -> impl IntoResponse { let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() .as_secs(); Json(serde_json::json!({ "status": "ok", "timestamp": timestamp, "version": env!("CARGO_PKG_VERSION") })) } async fn ingest_repository( State(state): State, Json(request): Json, ) -> Result { state.metrics.record_request().await; let start = Instant::now(); // Check cache first let cache_key = RepositoryCache::generate_key( &request.url, request.branch.as_deref(), request.filter_preset.as_deref(), request.path_prefix.as_deref(), ); if let Some(cached) = state.repo_cache.get(&cache_key).await { state.metrics.record_response_time(start.elapsed()).await; return Ok(Json(IngestResponse { id: cached.result.id.clone(), status: "completed".to_string(), })); } let params = IngestionParams { url: request.url.clone(), subpath: request.subpath.clone(), branch: request.branch.clone(), path_prefix: request.path_prefix.or(request.subpath), include_patterns: request.include_patterns, exclude_patterns: request.exclude_patterns, max_file_size: request.max_file_size, filter_preset: request.filter_preset.clone(), raw: request.raw, }; let ingestion_result = match timeout(INGEST_TIMEOUT, async { IngestionService::ingest(params).await }) .await { Ok(Ok(result)) => result, Ok(Err(e)) => { state.metrics.record_error().await; return Err(AppError::InternalError(format!("Ingestion failed: {}", e))); } Err(_) => { state.metrics.record_error().await; return Err(AppError::Timeout); } }; // Update metrics state .metrics .record_ingestion( &request.url, ingestion_result.summary.files_analyzed, 
ingestion_result.summary.total_size as u64, ) .await; // Get commit hash (simplified - would need actual implementation) let commit_hash = ingestion_result.metadata.url.clone(); // Cache the result state .repo_cache .put( cache_key, request.url, commit_hash, ingestion_result.clone(), ) .await; state.metrics.record_response_time(start.elapsed()).await; Ok(Json(IngestResponse { id: ingestion_result.id.clone(), status: "completed".to_string(), })) } async fn get_result( State(state): State, Path(_id): Path, ) -> Result { state.metrics.record_request().await; // Check all cache entries for matching ID // This is a simplified approach - in production you'd want a separate ID index Err::, AppError>(AppError::NotFound) } async fn download_content( State(state): State, Path(_id): Path, ) -> Result { state.metrics.record_request().await; // Similar to get_result but returns as download Err::(AppError::NotFound) } async fn handle_repo( State(state): State, Path((owner, repo)): Path<(String, String)>, Query(params): Query, ) -> Result { ingest_github_repo(state, owner, repo, None, None, params).await } async fn handle_repo_branch( State(state): State, Path((owner, repo, branch)): Path<(String, String, String)>, Query(params): Query, ) -> Result { ingest_github_repo(state, owner, repo, Some(branch), None, params).await } async fn handle_repo_path( State(state): State, Path((owner, repo, branch, path)): Path<(String, String, String, String)>, Query(params): Query, ) -> Result { ingest_github_repo(state, owner, repo, Some(branch), Some(path), params).await } async fn handle_repo_compare( State(state): State, Path((owner, repo, compare_spec)): Path<(String, String, String)>, Query(params): Query, ) -> Result { if !validate_github_name(&owner) || !validate_github_name(&repo) { return Err(AppError::InvalidRequest( "Invalid owner or repo name".to_string(), )); } let (base, head) = parse_compare_spec(&compare_spec).ok_or_else(|| { AppError::InvalidRequest( "Invalid compare format. Use 'base...head' or 'base..head'".to_string(), ) })?; let url = format!("https://github.com/{owner}/{repo}"); state.metrics.record_request().await; let diff_content = timeout(INGEST_TIMEOUT, async { IngestionService::generate_diff( &url, &base, &head, params.include.as_deref(), params.exclude.as_deref(), ) .await }) .await .map_err(|_| AppError::Timeout)? 
.map_err(|e| AppError::InternalError(format!("Failed to generate diff: {}", e)))?; let mut headers = HeaderMap::new(); headers.insert( "content-type", "text/plain; charset=utf-8" .parse() .map_err(|e| AppError::InternalError(format!("Header parse error: {}", e)))?, ); Ok((headers, diff_content)) } fn parse_compare_spec(spec: &str) -> Option<(String, String)> { if let Some((base, head)) = spec.split_once("...") { if !base.is_empty() && !head.is_empty() { Some((base.to_string(), head.to_string())) } else { None } } else if let Some((base, head)) = spec.split_once("..") { if !base.is_empty() && !head.is_empty() { Some((base.to_string(), head.to_string())) } else { None } } else { None } } async fn ingest_github_repo( state: AppState, owner: String, repo: String, branch: Option, path_prefix: Option, params: QueryParams, ) -> Result { state.metrics.record_request().await; let start = Instant::now(); if !validate_github_name(&owner) || !validate_github_name(&repo) { state.metrics.record_error().await; return Err(AppError::InvalidRequest( "Invalid owner or repo name".to_string(), )); } let url = format!("https://github.com/{owner}/{repo}"); state.metrics.record_request().await; // Check cache let cache_key = RepositoryCache::generate_key( &url, branch.as_deref().or(params.branch.as_deref()), params.preset.as_deref(), path_prefix .as_ref() .or(params.path.as_ref()) .or(params.subpath.as_ref()) .map(|s| s.as_str()), ); if let Some(cached) = state.repo_cache.get(&cache_key).await { state.metrics.record_response_time(start.elapsed()).await; return Ok(cached.result.content); } let ingestion_params = IngestionParams { url: url.clone(), subpath: params.subpath.clone(), branch: branch.or(params.branch), path_prefix: path_prefix .or(params.path.clone()) .or(params.subpath.clone()) .filter(|p| !p.contains("..") && !p.starts_with('/')), include_patterns: params .include .unwrap_or_default() .split(',') .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) .collect(), exclude_patterns: params .exclude .unwrap_or_default() .split(',') .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) .collect(), max_file_size: params.max_size.unwrap_or(10 * 1024 * 1024), filter_preset: params.preset.clone(), raw: params.raw.unwrap_or(false), }; let result = match timeout(INGEST_TIMEOUT, async { IngestionService::ingest(ingestion_params).await }) .await { Ok(Ok(result)) => result, Ok(Err(e)) => { state.metrics.record_error().await; return Err(AppError::InternalError(format!("Ingestion failed: {}", e))); } Err(_) => { state.metrics.record_error().await; return Err(AppError::Timeout); } }; // Update metrics state .metrics .record_ingestion( &url, result.summary.files_analyzed, result.summary.total_size as u64, ) .await; // Cache the result let commit_hash = result.metadata.url.clone(); state .repo_cache .put(cache_key, url, commit_hash, result.clone()) .await; state.metrics.record_response_time(start.elapsed()).await; Ok(result.content) } async fn get_top_repos(State(state): State) -> impl IntoResponse { let repos = state.metrics.get_top_repositories(10).await; Json(repos) } async fn get_metrics(State(state): State) -> impl IntoResponse { let metrics = state.metrics.get_metrics().await; Json(metrics) } async fn get_cache_stats(State(state): State) -> impl IntoResponse { let stats = state.repo_cache.stats().await; Json(stats) } pub fn create_router() -> Router { let state = AppState::new(); let router = Router::new() // Landing page and static assets .route("/", get(landing_page)) .route("/help.html", get(help_page)) 
.route("/styles.css", get(styles_css)) .route("/globals.css", get(globals_css)) .route("/install.sh", get(install_sh)) .route("/install.ps1", get(install_ps1)) // API endpoints .route("/health", get(health)) .route("/metrics", get(get_metrics)) .route("/api/metrics/top", get(get_top_repos)) .route("/cache/stats", get(get_cache_stats)) .route("/api/ingest", post(ingest_repository)) .route("/api/result/{id}", get(get_result)) .route("/api/download/{id}", get(download_content)) // GitHub repository routes .route("/{owner}/{repo}", get(handle_repo)) .route( "/{owner}/{repo}/compare/{compare_spec}", get(handle_repo_compare), ) .route("/{owner}/{repo}/tree/{branch}", get(handle_repo_branch)) .route( "/{owner}/{repo}/tree/{branch}/{*path}", get(handle_repo_path), ) .with_state(state); router.layer( ServiceBuilder::new() .layer(SetResponseHeaderLayer::overriding( axum::http::header::X_FRAME_OPTIONS, axum::http::HeaderValue::from_static("DENY"), )) .layer(SetResponseHeaderLayer::overriding( axum::http::header::X_CONTENT_TYPE_OPTIONS, axum::http::HeaderValue::from_static("nosniff"), )) .layer(CorsLayer::permissive()) .layer(CompressionLayer::new()), ) } pub async fn serve(addr: std::net::SocketAddr) -> anyhow::Result<()> { let app = create_router(); let listener = tokio::net::TcpListener::bind(addr).await?; println!("HTTP server listening on {addr}"); axum::serve(listener, app).await?; Ok(()) } === api/src/ingestion.rs === use githem_core::{ count_files, estimate_tokens, generate_tree, is_remote_url, normalize_source_url, FilterPreset, FilterStats, IngestOptions, Ingester, IngestionCallback, }; use serde::{Deserialize, Serialize}; use std::path::Path; use std::time::{SystemTime, UNIX_EPOCH}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IngestionParams { pub url: String, pub branch: Option, pub subpath: Option, pub path_prefix: Option, #[serde(default)] pub include_patterns: Vec, #[serde(default)] pub exclude_patterns: Vec, #[serde(default = "default_max_file_size")] pub max_file_size: usize, pub filter_preset: Option, #[serde(default)] pub raw: bool, } fn default_max_file_size() -> usize { 10 * 1024 * 1024 } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IngestionResult { pub id: String, pub summary: IngestionSummary, pub tree: String, pub content: String, pub metadata: RepositoryMetadata, pub filter_stats: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IngestionSummary { pub repository: String, pub branch: String, pub subpath: Option, pub files_analyzed: usize, pub total_size: usize, pub estimated_tokens: usize, pub filter_preset: String, pub filtering_enabled: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RepositoryMetadata { pub url: String, pub default_branch: String, pub branches: Vec, pub size: Option, } pub struct IngestionService; impl IngestionService { pub async fn ingest( params: IngestionParams, ) -> Result> { let params = Self::normalize_params(params)?; let filter_preset = if params.raw { Some(FilterPreset::Raw) } else if let Some(preset) = Self::parse_filter_preset(params.filter_preset.as_deref()) { Some(preset) } else { Some(FilterPreset::Standard) }; let filter_preset_name = match filter_preset { Some(FilterPreset::Raw) => "raw", Some(FilterPreset::Standard) => "standard", Some(FilterPreset::CodeOnly) => "code-only", Some(FilterPreset::Minimal) => "minimal", None => "none", }; let options = IngestOptions { include_patterns: params.include_patterns.clone(), exclude_patterns: params.exclude_patterns.clone(), 
max_file_size: params.max_file_size, include_untracked: false, branch: params.branch.clone(), path_prefix: params.path_prefix.clone(), filter_preset, apply_default_filters: false, }; let mut ingester = if is_remote_url(¶ms.url) { Ingester::from_url_cached(¶ms.url, options)? } else { let path = std::path::PathBuf::from(¶ms.url); Ingester::from_path(&path, options)? }; let filter_stats = ingester.get_filter_stats().ok(); let mut content = Vec::new(); if ingester.cache_key.is_some() { ingester.ingest_cached(&mut content)?; } else { ingester.ingest(&mut content)?; } let content_str = String::from_utf8(content)?; let id = format!( "{}-{}", SystemTime::now().duration_since(UNIX_EPOCH)?.as_millis(), rand::random::() ); let tree = generate_tree(&content_str); let files_analyzed = count_files(&content_str); let total_size = content_str.len(); let estimated_tokens = estimate_tokens(&content_str); let summary = IngestionSummary { repository: params.url.clone(), branch: params.branch.unwrap_or_else(|| "main".to_string()), subpath: params.path_prefix.clone(), files_analyzed, total_size, estimated_tokens, filter_preset: filter_preset_name.to_string(), filtering_enabled: filter_preset != Some(FilterPreset::Raw), }; let metadata = RepositoryMetadata { url: params.url, default_branch: "main".to_string(), branches: vec!["main".to_string()], size: Some(total_size as u64), }; Ok(IngestionResult { id, summary, tree, content: content_str, metadata, filter_stats, }) } pub fn normalize_params(params: IngestionParams) -> Result { if params.url.is_empty() { return Err("URL is required".to_string()); } let (normalized_url, final_branch, final_path_prefix) = normalize_source_url( ¶ms.url, params.branch.clone(), params.path_prefix.clone(), )?; if !is_remote_url(&normalized_url) && !std::path::Path::new(&normalized_url).exists() { return Err("Invalid URL or path".to_string()); } Ok(IngestionParams { url: normalized_url, subpath: params.subpath, branch: final_branch, path_prefix: final_path_prefix, include_patterns: params.include_patterns, exclude_patterns: params.exclude_patterns, max_file_size: params.max_file_size, filter_preset: params.filter_preset, raw: params.raw, }) } pub fn parse_filter_preset(preset_str: Option<&str>) -> Option { preset_str.and_then(|s| match s.to_lowercase().as_str() { "raw" => Some(FilterPreset::Raw), "standard" => Some(FilterPreset::Standard), "code-only" | "code_only" | "codeonly" => Some(FilterPreset::CodeOnly), "minimal" => Some(FilterPreset::Minimal), _ => None, }) } pub async fn generate_diff( url: &str, base: &str, head: &str, _include_patterns: Option<&str>, _exclude_patterns: Option<&str>, ) -> Result> { let options = IngestOptions::default(); let ingester = if is_remote_url(url) { Ingester::from_url(url, options)? 
} else { return Err("Diff generation requires a remote URL".into()); }; let diff_content = ingester.generate_diff(base, head)?; Ok(diff_content) } } #[allow(dead_code)] pub struct WebSocketCallback where F: FnMut(WebSocketMessage), { pub send_fn: F, } #[derive(Debug, Serialize)] #[serde(tag = "type")] pub enum WebSocketMessage { Progress { stage: String, message: String }, File { path: String, content: String }, Complete { files: usize, bytes: usize }, Error { message: String }, FilterStats { stats: FilterStats }, } impl IngestionCallback for WebSocketCallback where F: FnMut(WebSocketMessage) + Send + Sync, { fn on_progress(&mut self, stage: &str, message: &str) { (self.send_fn)(WebSocketMessage::Progress { stage: stage.to_string(), message: message.to_string(), }); } fn on_file(&mut self, path: &Path, content: &str) { (self.send_fn)(WebSocketMessage::File { path: path.display().to_string(), content: content.to_string(), }); } fn on_complete(&mut self, files: usize, bytes: usize) { (self.send_fn)(WebSocketMessage::Complete { files, bytes }); } fn on_error(&mut self, error: &str) { (self.send_fn)(WebSocketMessage::Error { message: error.to_string(), }); } } === api/src/lib.rs === pub mod cache; pub mod http; pub mod ingestion; pub mod metrics; pub mod websocket; === api/src/main.rs === mod cache; mod http; mod ingestion; mod metrics; mod websocket; use anyhow::Result; use std::net::SocketAddr; use tracing::info; #[tokio::main] async fn main() -> Result<()> { tracing_subscriber::fmt() .with_env_filter( tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| "githem_api=info,tower_http=info".into()), ) .init(); let http_port = std::env::var("HTTP_PORT") .ok() .and_then(|p| p.parse().ok()) .unwrap_or(42069); let http_addr = SocketAddr::from(([0, 0, 0, 0], http_port)); let ws_port = std::env::var("WS_PORT") .ok() .and_then(|p| p.parse().ok()) .unwrap_or(42070); let ws_addr = SocketAddr::from(([0, 0, 0, 0], ws_port)); info!("Starting githem-api HTTP on http://{}", http_addr); info!("Starting githem-api WebSocket on ws://{}", ws_addr); tokio::try_join!(http::serve(http_addr), websocket::serve(ws_addr))?; Ok(()) } === api/src/metrics.rs === use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; use tokio::sync::RwLock; #[derive(Clone, Debug, Default, Serialize, Deserialize)] pub struct Metrics { pub total_requests: u64, pub total_ingestions: u64, pub cache_hits: u64, pub cache_misses: u64, pub total_bytes_processed: u64, pub total_files_processed: u64, pub average_response_time_ms: u64, pub errors: u64, pub repositories: HashMap, pub hourly_stats: Vec, } #[derive(Clone, Debug, Serialize, Deserialize)] pub struct RepoMetrics { pub url: String, pub request_count: u64, pub last_accessed: u64, pub size_bytes: u64, pub file_count: usize, } #[derive(Clone, Debug, Serialize, Deserialize)] pub struct HourlyStats { pub hour: u64, pub requests: u64, pub cache_hits: u64, pub bytes: u64, } pub struct MetricsCollector { metrics: Arc>, response_times: Arc>>, } impl Default for MetricsCollector { fn default() -> Self { Self::new() } } impl MetricsCollector { pub fn new() -> Self { Self { metrics: Arc::new(RwLock::new(Metrics::default())), response_times: Arc::new(RwLock::new(Vec::new())), } } pub async fn record_request(&self) { let mut metrics = self.metrics.write().await; metrics.total_requests += 1; } pub async fn record_ingestion(&self, repo_url: &str, files: usize, bytes: u64) { let mut metrics = self.metrics.write().await; 
metrics.total_ingestions += 1; metrics.total_files_processed += files as u64; metrics.total_bytes_processed += bytes; let now = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() .as_secs(); // get existing request count before updating let existing_count = metrics .repositories .get(repo_url) .map(|r| r.request_count) .unwrap_or(0); metrics.repositories.insert( repo_url.to_string(), RepoMetrics { url: repo_url.to_string(), request_count: existing_count + 1, last_accessed: now, size_bytes: bytes, file_count: files, }, ); // update hourly stats let hour = now / 3600; if let Some(stat) = metrics.hourly_stats.iter_mut().find(|s| s.hour == hour) { stat.requests += 1; stat.bytes += bytes; } else { metrics.hourly_stats.push(HourlyStats { hour, requests: 1, cache_hits: 0, bytes, }); } // keep only last 24 hours let cutoff = hour.saturating_sub(24); metrics.hourly_stats.retain(|s| s.hour > cutoff); } pub async fn record_cache_hit(&self) { let mut metrics = self.metrics.write().await; metrics.cache_hits += 1; let hour = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() .as_secs() / 3600; if let Some(stat) = metrics.hourly_stats.iter_mut().find(|s| s.hour == hour) { stat.cache_hits += 1; } } pub async fn record_cache_miss(&self) { let mut metrics = self.metrics.write().await; metrics.cache_misses += 1; } pub async fn record_error(&self) { let mut metrics = self.metrics.write().await; metrics.errors += 1; } pub async fn record_response_time(&self, duration: Duration) { let mut times = self.response_times.write().await; times.push(duration); // keep only last 1000 response times if times.len() > 1000 { let excess = times.len() - 1000; times.drain(0..excess); } // update average if !times.is_empty() { let avg_ms = times.iter().map(|d| d.as_millis() as u64).sum::() / times.len() as u64; let mut metrics = self.metrics.write().await; metrics.average_response_time_ms = avg_ms; } } pub async fn get_metrics(&self) -> Metrics { self.metrics.read().await.clone() } pub async fn get_top_repositories(&self, limit: usize) -> Vec { let metrics = self.metrics.read().await; let mut repos: Vec<_> = metrics.repositories.values().cloned().collect(); repos.sort_by(|a, b| b.request_count.cmp(&a.request_count)); repos.truncate(limit); repos } } === api/src/websocket.rs === use crate::ingestion::{IngestionParams, IngestionService, WebSocketMessage}; use anyhow::Result; use axum::{ extract::{ ws::{Message, WebSocket, WebSocketUpgrade}, Query, }, response::IntoResponse, routing::get, Router, }; use serde::Deserialize; use std::net::SocketAddr; use std::time::Instant; use tracing::{error, info}; #[derive(Debug, Deserialize)] struct WsQuery { url: String, #[serde(default)] include: Vec, #[serde(default)] exclude: Vec, #[serde(default = "default_max_size")] max_size: usize, #[serde(default)] branch: Option, #[serde(default)] preset: Option, #[serde(default)] raw: bool, } fn default_max_size() -> usize { 10 * 1024 * 1024 } async fn websocket_handler( ws: WebSocketUpgrade, Query(params): Query, ) -> impl IntoResponse { ws.on_upgrade(move |socket| handle_socket(socket, params)) } async fn handle_socket(mut socket: WebSocket, params: WsQuery) { let _start = Instant::now(); if let Err(e) = socket .send(Message::Text( serde_json::to_string(&WebSocketMessage::Progress { stage: "starting".to_string(), message: format!("Processing {}", params.url), }) .unwrap() .into(), )) .await { error!("Failed to send message: {}", e); return; } let ingestion_params = IngestionParams { url: 
params.url.clone(), subpath: None, branch: params.branch, path_prefix: None, include_patterns: params.include, exclude_patterns: params.exclude, max_file_size: params.max_size, filter_preset: params.preset, raw: params.raw, }; if let Err(e) = socket .send(Message::Text( serde_json::to_string(&WebSocketMessage::Progress { stage: "cloning".to_string(), message: "Cloning repository...".to_string(), }) .unwrap() .into(), )) .await { error!("Failed to send message: {}", e); return; } match IngestionService::ingest(ingestion_params).await { Ok(result) => { if let Err(e) = socket .send(Message::Text( serde_json::to_string(&WebSocketMessage::Progress { stage: "ingesting".to_string(), message: "Processing files...".to_string(), }) .unwrap() .into(), )) .await { error!("Failed to send message: {}", e); return; } // Send filter stats if available if let Some(stats) = &result.filter_stats { let _ = socket .send(Message::Text( serde_json::to_string(&WebSocketMessage::FilterStats { stats: stats.clone(), }) .unwrap() .into(), )) .await; } let _ = socket .send(Message::Text( serde_json::to_string(&WebSocketMessage::File { path: "all_files.txt".to_string(), content: result.content, }) .unwrap() .into(), )) .await; let _ = socket .send(Message::Text( serde_json::to_string(&WebSocketMessage::Complete { files: result.summary.files_analyzed, bytes: result.summary.total_size, }) .unwrap() .into(), )) .await; info!("WebSocket session completed for {}", params.url); } Err(e) => { let _ = socket .send(Message::Text( serde_json::to_string(&WebSocketMessage::Error { message: format!("Failed: {e}"), }) .unwrap() .into(), )) .await; } } } pub async fn serve(addr: SocketAddr) -> Result<()> { let app = Router::new().route("/", get(websocket_handler)); let listener = tokio::net::TcpListener::bind(addr).await?; axum::serve(listener, app).await?; Ok(()) } === core/Cargo.toml === [package] name = "githem-core" description = "Core library for Githem - Git repository print for LLM ready text" version.workspace = true edition.workspace = true authors.workspace = true license.workspace = true repository.workspace = true homepage.workspace = true [dependencies] anyhow = { workspace = true } git2 = { workspace = true } uuid = { workspace = true } serde = { workspace = true } libc = { workspace = true } walkdir = { workspace = true } sha2 = "0.10" bincode = "1.3" serde_json = { workspace = true } === core/src/cache.rs === use anyhow::Result; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use std::collections::HashMap; use std::fs; use std::path::{Path, PathBuf}; use std::time::{SystemTime, UNIX_EPOCH}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CacheEntry { pub repo_url: String, pub branch: String, pub commit_hash: String, pub files: Vec, pub metadata: CacheMetadata, pub created_at: u64, pub last_accessed: u64, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CachedFile { pub path: PathBuf, pub content: Vec, pub size: u64, pub is_binary: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CacheMetadata { pub total_files: usize, pub total_size: u64, pub tree_hash: String, pub cache_version: String, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CacheStats { pub total_entries: usize, pub total_size: u64, pub max_size: u64, pub expired_entries: usize, pub cache_dir: PathBuf, } #[derive(Debug, Clone, Serialize, Deserialize)] struct CacheIndex { pub entries: HashMap, } #[derive(Debug, Clone, Serialize, Deserialize)] struct CacheEntryInfo { pub key: String, pub 
path: PathBuf, pub size: u64, pub created_at: u64, pub last_accessed: u64, pub commit_hash: String, } pub struct RepositoryCache { cache_dir: PathBuf, index: HashMap, max_cache_size: u64, max_age_seconds: u64, } impl RepositoryCache { pub fn new() -> Result { Self::with_config(5 * 1024 * 1024 * 1024, 7 * 24 * 3600) } pub fn with_config(max_size: u64, max_age_seconds: u64) -> Result { let cache_dir = Self::get_cache_dir()?; fs::create_dir_all(&cache_dir)?; let index = Self::load_index(&cache_dir).unwrap_or_default(); Ok(Self { cache_dir, index, max_cache_size: max_size, max_age_seconds, }) } fn get_cache_dir() -> Result { let cache_dir = if let Ok(xdg_cache) = std::env::var("XDG_CACHE_HOME") { PathBuf::from(xdg_cache).join("githem") } else if let Ok(home) = std::env::var("HOME") { PathBuf::from(home).join(".cache").join("githem") } else { PathBuf::from("/tmp/githem-cache") }; Ok(cache_dir) } pub fn generate_cache_key(url: &str, branch: Option<&str>) -> String { let mut hasher = Sha256::new(); hasher.update(url.as_bytes()); if let Some(branch) = branch { hasher.update(b":"); hasher.update(branch.as_bytes()); } format!("{:x}", hasher.finalize()) } pub fn get(&mut self, key: &str) -> Result> { if let Some(info) = self.index.get_mut(key) { let now = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(); if now - info.created_at > self.max_age_seconds { self.remove(key)?; return Ok(None); } info.last_accessed = now; let cache_path = &info.path; if cache_path.exists() { let data = fs::read(cache_path)?; let entry: CacheEntry = bincode::deserialize(&data)?; self.save_index()?; return Ok(Some(entry)); } } Ok(None) } pub fn put(&mut self, key: String, entry: CacheEntry) -> Result<()> { let serialized = bincode::serialize(&entry)?; let entry_size = serialized.len() as u64; self.evict_if_needed(entry_size)?; let cache_file = self.cache_dir.join(format!("{}.cache", key)); fs::write(&cache_file, serialized)?; let now = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(); self.index.insert( key.clone(), CacheEntryInfo { key, path: cache_file, size: entry_size, created_at: now, last_accessed: now, commit_hash: entry.commit_hash.clone(), }, ); self.save_index()?; Ok(()) } pub fn check_commit(&self, key: &str, current_commit: &str) -> CacheCommitStatus { if let Some(info) = self.index.get(key) { if info.commit_hash == current_commit { CacheCommitStatus::Match } else { CacheCommitStatus::Outdated } } else { CacheCommitStatus::NotCached } } pub fn remove(&mut self, key: &str) -> Result<()> { if let Some(info) = self.index.remove(key) { if info.path.exists() { fs::remove_file(info.path)?; } self.save_index()?; } Ok(()) } fn evict_if_needed(&mut self, new_entry_size: u64) -> Result<()> { let total_size: u64 = self.index.values().map(|e| e.size).sum(); if total_size + new_entry_size <= self.max_cache_size { return Ok(()); } let mut entries: Vec<_> = self.index.values().cloned().collect(); entries.sort_by_key(|e| e.last_accessed); let mut freed_space = 0u64; for entry in entries { if total_size - freed_space + new_entry_size <= self.max_cache_size { break; } freed_space += entry.size; self.remove(&entry.key)?; } Ok(()) } fn load_index(cache_dir: &Path) -> Result> { let index_path = cache_dir.join("index.json"); if index_path.exists() { let data = fs::read_to_string(index_path)?; let index: CacheIndex = serde_json::from_str(&data)?; Ok(index.entries) } else { Ok(HashMap::new()) } } fn save_index(&self) -> Result<()> { let index_path = self.cache_dir.join("index.json"); let index = CacheIndex { entries: 
self.index.clone(), }; let data = serde_json::to_string_pretty(&index)?; fs::write(index_path, data)?; Ok(()) } pub fn clear_all(&mut self) -> Result<()> { for key in self.index.keys().cloned().collect::>() { self.remove(&key)?; } Ok(()) } pub fn get_stats(&self) -> CacheStats { let total_size: u64 = self.index.values().map(|e| e.size).sum(); let now = SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap() .as_secs(); let expired_count = self .index .values() .filter(|e| now - e.created_at > self.max_age_seconds) .count(); CacheStats { total_entries: self.index.len(), total_size, max_size: self.max_cache_size, expired_entries: expired_count, cache_dir: self.cache_dir.clone(), } } } #[derive(Debug, PartialEq)] pub enum CacheCommitStatus { Match, Outdated, NotCached, } pub struct CacheManager; impl CacheManager { pub fn clear_cache() -> Result<()> { let mut cache = RepositoryCache::new()?; cache.clear_all()?; Ok(()) } pub fn get_stats() -> Result { let cache = RepositoryCache::new()?; Ok(cache.get_stats()) } } === core/src/filtering.rs === // core/src/filtering.rs use serde::{Deserialize, Serialize}; /// Centralized filtering configuration for githem #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct FilterConfig { /// Default exclude patterns applied unless raw mode is used pub default_excludes: Vec, /// Categories of files for selective filtering pub categories: FilterCategories, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FilterCategories { pub lock_files: Vec, pub dependencies: Vec, pub build_artifacts: Vec, pub ide_files: Vec, pub media_files: Vec, pub binary_files: Vec, pub documents: Vec, pub data_files: Vec, pub fonts: Vec, pub logs: Vec, pub cache: Vec, pub os_files: Vec, pub version_control: Vec, pub secrets: Vec, } impl Default for FilterCategories { fn default() -> Self { Self { lock_files: vec![ "*.lock".to_string(), "Cargo.lock".to_string(), "package-lock.json".to_string(), "yarn.lock".to_string(), "pnpm-lock.yaml".to_string(), "bun.lockb".to_string(), "composer.lock".to_string(), "Pipfile.lock".to_string(), "poetry.lock".to_string(), "Gemfile.lock".to_string(), "go.sum".to_string(), "mix.lock".to_string(), "pubspec.lock".to_string(), "packages-lock.json".to_string(), // Unity "vcpkg.json".to_string(), ], dependencies: vec![ "node_modules/*".to_string(), "vendor/*".to_string(), "target/*".to_string(), ".cargo/*".to_string(), "__pycache__/*".to_string(), ".venv/*".to_string(), "venv/*".to_string(), "env/*".to_string(), "site-packages/*".to_string(), "gems/*".to_string(), "bower_components/*".to_string(), "jspm_packages/*".to_string(), ".pub-cache/*".to_string(), "Packages/*".to_string(), // Unity "Library/*".to_string(), // Unity "obj/*".to_string(), // .NET "bin/*".to_string(), // .NET "pkg/*".to_string(), // Go "_build/*".to_string(), // Elixir "deps/*".to_string(), // Elixir ], build_artifacts: vec![ "dist/*".to_string(), "build/*".to_string(), "out/*".to_string(), ".next/*".to_string(), ".nuxt/*".to_string(), ".svelte-kit/*".to_string(), ".output/*".to_string(), "coverage/*".to_string(), ".nyc_output/*".to_string(), "*.tsbuildinfo".to_string(), "*.buildlog".to_string(), "cmake-build-*/*".to_string(), "Release/*".to_string(), "Debug/*".to_string(), "x64/*".to_string(), "x86/*".to_string(), ".gradle/*".to_string(), "gradle/*".to_string(), "*.class".to_string(), "*.o".to_string(), "*.a".to_string(), "*.obj".to_string(), "*.lib".to_string(), "*.exp".to_string(), "*.pdb".to_string(), "*.ilk".to_string(), ], ide_files: vec![ 
".vscode/*".to_string(), ".idea/*".to_string(), "*.swp".to_string(), "*.swo".to_string(), "*~".to_string(), ".DS_Store".to_string(), "Thumbs.db".to_string(), "*.tmp".to_string(), ".vs/*".to_string(), "*.vcxproj.user".to_string(), "*.suo".to_string(), "*.user".to_string(), ".vimrc.local".to_string(), ".sublime-*".to_string(), "*.sublime-workspace".to_string(), ".fleet/*".to_string(), ".zed/*".to_string(), ], media_files: vec![ // Images "*.png".to_string(), "*.jpg".to_string(), "*.jpeg".to_string(), "*.gif".to_string(), "*.bmp".to_string(), "*.tiff".to_string(), "*.tga".to_string(), "*.ico".to_string(), "*.svg".to_string(), "*.webp".to_string(), "*.avif".to_string(), "*.heic".to_string(), "*.raw".to_string(), "*.psd".to_string(), "*.ai".to_string(), "*.eps".to_string(), // Videos "*.mp4".to_string(), "*.avi".to_string(), "*.mov".to_string(), "*.wmv".to_string(), "*.flv".to_string(), "*.webm".to_string(), "*.mkv".to_string(), "*.m4v".to_string(), "*.3gp".to_string(), "*.asf".to_string(), // Audio "*.mp3".to_string(), "*.wav".to_string(), "*.flac".to_string(), "*.aac".to_string(), "*.ogg".to_string(), "*.wma".to_string(), "*.m4a".to_string(), "*.opus".to_string(), ], binary_files: vec![ "*.zip".to_string(), "*.tar".to_string(), "*.gz".to_string(), "*.bz2".to_string(), "*.xz".to_string(), "*.rar".to_string(), "*.7z".to_string(), "*.dmg".to_string(), "*.iso".to_string(), "*.exe".to_string(), "*.msi".to_string(), "*.app".to_string(), "*.deb".to_string(), "*.rpm".to_string(), "*.pkg".to_string(), "*.dll".to_string(), "*.so".to_string(), "*.dylib".to_string(), "*.bin".to_string(), "*.dat".to_string(), "*.img".to_string(), ], documents: vec![ "*.pdf".to_string(), "*.doc".to_string(), "*.docx".to_string(), "*.xls".to_string(), "*.xlsx".to_string(), "*.ppt".to_string(), "*.pptx".to_string(), "*.odt".to_string(), "*.ods".to_string(), "*.odp".to_string(), "*.rtf".to_string(), "*.pages".to_string(), "*.numbers".to_string(), "*.keynote".to_string(), ], data_files: vec![ "*.db".to_string(), "*.sqlite".to_string(), "*.sqlite3".to_string(), "*.db3".to_string(), "*.dump".to_string(), "*.sql".to_string(), "*.bak".to_string(), "*.mdb".to_string(), "*.accdb".to_string(), // Large structured data (configurable) "*.csv".to_string(), "*.json".to_string(), "*.xml".to_string(), "*.yaml".to_string(), "*.yml".to_string(), "*.parquet".to_string(), "*.arrow".to_string(), "*.avro".to_string(), ], fonts: vec![ "*.ttf".to_string(), "*.otf".to_string(), "*.woff".to_string(), "*.woff2".to_string(), "*.eot".to_string(), "*.pfb".to_string(), "*.pfm".to_string(), "*.afm".to_string(), "*.fon".to_string(), "*.fnt".to_string(), ], logs: vec![ "*.log".to_string(), "logs/*".to_string(), "log/*".to_string(), "*.out".to_string(), "*.err".to_string(), "nohup.out".to_string(), "*.trace".to_string(), "*.pid".to_string(), ], cache: vec![ ".cache/*".to_string(), "cache/*".to_string(), ".temp/*".to_string(), "temp/*".to_string(), "tmp/*".to_string(), ".tmp/*".to_string(), "*.cache".to_string(), ".parcel-cache/*".to_string(), ".turbo/*".to_string(), ".swc/*".to_string(), ".eslintcache".to_string(), ".stylelintcache".to_string(), ".prettiercache".to_string(), "*.tsbuildinfo".to_string(), ".rollup.cache/*".to_string(), ], os_files: vec![ ".DS_Store".to_string(), ".AppleDouble".to_string(), ".LSOverride".to_string(), "._*".to_string(), ".DocumentRevisions-V100".to_string(), ".fseventsd".to_string(), ".Spotlight-V100".to_string(), ".TemporaryItems".to_string(), ".Trashes".to_string(), ".VolumeIcon.icns".to_string(), 
".com.apple.timemachine.donotpresent".to_string(), ".AppleDB".to_string(), ".AppleDesktop".to_string(), "Network Trash Folder".to_string(), "Temporary Items".to_string(), ".apdisk".to_string(), "Thumbs.db".to_string(), "Thumbs.db:encryptable".to_string(), "ehthumbs.db".to_string(), "ehthumbs_vista.db".to_string(), "*.stackdump".to_string(), "[Dd]esktop.ini".to_string(), "$RECYCLE.BIN/*".to_string(), "*.cab".to_string(), "*.lnk".to_string(), ], version_control: vec![ ".git/*".to_string(), ".svn/*".to_string(), ".hg/*".to_string(), ".bzr/*".to_string(), "_darcs/*".to_string(), ".pijul/*".to_string(), "CVS/*".to_string(), ".cvs/*".to_string(), "SCCS/*".to_string(), "RCS/*".to_string(), ".gitignore_global".to_string(), ".gitkeep".to_string(), ".gitattributes_global".to_string(), ], secrets: vec![ ".env".to_string(), ".env.local".to_string(), ".env.*.local".to_string(), ".env.production".to_string(), ".env.development".to_string(), ".env.staging".to_string(), ".env.test".to_string(), "*.key".to_string(), "*.pem".to_string(), "*.crt".to_string(), "*.cert".to_string(), "*.p12".to_string(), "*.pfx".to_string(), "*.jks".to_string(), "*.keystore".to_string(), "id_rsa".to_string(), "id_dsa".to_string(), "id_ecdsa".to_string(), "id_ed25519".to_string(), "*.ppk".to_string(), ".ssh/*".to_string(), "credentials".to_string(), "secrets.json".to_string(), "config.json".to_string(), // Often contains secrets ".aws/*".to_string(), ".azure/*".to_string(), ".gcloud/*".to_string(), ], } } } /// Filter preset configurations #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum FilterPreset { /// No filtering - include everything Raw, /// Standard filtering for LLM analysis (default) Standard, /// Only source code and documentation CodeOnly, /// Minimal filtering - just exclude obvious binary/large files Minimal, } impl FilterConfig { /// Get the default filter configuration pub fn new() -> Self { let mut config = Self::default(); config.build_default_excludes(); config } /// Build the default excludes from all categories fn build_default_excludes(&mut self) { let mut excludes = Vec::new(); excludes.extend(self.categories.lock_files.clone()); excludes.extend(self.categories.dependencies.clone()); excludes.extend(self.categories.build_artifacts.clone()); excludes.extend(self.categories.ide_files.clone()); excludes.extend(self.categories.media_files.clone()); excludes.extend(self.categories.binary_files.clone()); excludes.extend(self.categories.documents.clone()); excludes.extend(self.categories.data_files.clone()); excludes.extend(self.categories.fonts.clone()); excludes.extend(self.categories.logs.clone()); excludes.extend(self.categories.cache.clone()); excludes.extend(self.categories.os_files.clone()); excludes.extend(self.categories.version_control.clone()); excludes.extend(self.categories.secrets.clone()); // Remove duplicates excludes.sort(); excludes.dedup(); self.default_excludes = excludes; } /// Get excludes for a specific preset pub fn get_excludes_for_preset(&self, preset: FilterPreset) -> Vec { match preset { FilterPreset::Raw => Vec::new(), FilterPreset::Standard => self.default_excludes.clone(), FilterPreset::CodeOnly => { let mut excludes = Vec::new(); excludes.extend(self.categories.lock_files.clone()); excludes.extend(self.categories.dependencies.clone()); excludes.extend(self.categories.build_artifacts.clone()); excludes.extend(self.categories.ide_files.clone()); excludes.extend(self.categories.media_files.clone()); excludes.extend(self.categories.binary_files.clone()); 
excludes.extend(self.categories.documents.clone()); excludes.extend(self.categories.data_files.clone()); excludes.extend(self.categories.fonts.clone()); excludes.extend(self.categories.logs.clone()); excludes.extend(self.categories.cache.clone()); excludes.extend(self.categories.os_files.clone()); excludes.extend(self.categories.version_control.clone()); excludes.extend(self.categories.secrets.clone()); // For code-only, also exclude common non-code files excludes.extend(vec![ "*.md".to_string(), "*.txt".to_string(), "*.rst".to_string(), "LICENSE*".to_string(), "CHANGELOG*".to_string(), "README*".to_string(), "CONTRIBUTING*".to_string(), "AUTHORS*".to_string(), "CREDITS*".to_string(), "NOTICE*".to_string(), ]); excludes } FilterPreset::Minimal => { let mut excludes = Vec::new(); excludes.extend(self.categories.media_files.clone()); excludes.extend(self.categories.binary_files.clone()); excludes.extend(self.categories.documents.clone()); excludes.extend(self.categories.fonts.clone()); excludes.extend(self.categories.version_control.clone()); excludes.extend(self.categories.secrets.clone()); excludes } } } /// Get excludes for specific categories pub fn get_excludes_for_categories(&self, categories: &[&str]) -> Vec { let mut excludes = Vec::new(); for category in categories { match *category { "lock_files" => excludes.extend(self.categories.lock_files.clone()), "dependencies" => excludes.extend(self.categories.dependencies.clone()), "build_artifacts" => excludes.extend(self.categories.build_artifacts.clone()), "ide_files" => excludes.extend(self.categories.ide_files.clone()), "media_files" => excludes.extend(self.categories.media_files.clone()), "binary_files" => excludes.extend(self.categories.binary_files.clone()), "documents" => excludes.extend(self.categories.documents.clone()), "data_files" => excludes.extend(self.categories.data_files.clone()), "fonts" => excludes.extend(self.categories.fonts.clone()), "logs" => excludes.extend(self.categories.logs.clone()), "cache" => excludes.extend(self.categories.cache.clone()), "os_files" => excludes.extend(self.categories.os_files.clone()), "version_control" => excludes.extend(self.categories.version_control.clone()), "secrets" => excludes.extend(self.categories.secrets.clone()), _ => {} // Unknown category, skip } } // Remove duplicates excludes.sort(); excludes.dedup(); excludes } /// Check if a pattern is in the default excludes pub fn is_excluded_by_default(&self, pattern: &str) -> bool { self.default_excludes.contains(&pattern.to_string()) } /// Get all available category names pub fn get_category_names(&self) -> Vec<&'static str> { vec![ "lock_files", "dependencies", "build_artifacts", "ide_files", "media_files", "binary_files", "documents", "data_files", "fonts", "logs", "cache", "os_files", "version_control", "secrets", ] } /// Create a custom configuration from existing config pub fn with_custom_excludes(&self, additional_excludes: Vec) -> Self { let mut config = self.clone(); config.default_excludes.extend(additional_excludes); config.default_excludes.sort(); config.default_excludes.dedup(); config } } /// Helper function to get default excludes (for backward compatibility) pub fn get_default_excludes() -> Vec { FilterConfig::new().default_excludes } /// Helper function to get excludes for a preset pub fn get_excludes_for_preset(preset: FilterPreset) -> Vec { FilterConfig::new().get_excludes_for_preset(preset) } #[cfg(test)] mod tests { use super::*; #[test] fn test_default_config() { let config = FilterConfig::new(); 
assert!(!config.default_excludes.is_empty()); assert!(config.default_excludes.contains(&"*.lock".to_string())); assert!(config .default_excludes .contains(&"node_modules/*".to_string())); } #[test] fn test_presets() { let config = FilterConfig::new(); let raw = config.get_excludes_for_preset(FilterPreset::Raw); assert!(raw.is_empty()); let standard = config.get_excludes_for_preset(FilterPreset::Standard); assert!(!standard.is_empty()); let minimal = config.get_excludes_for_preset(FilterPreset::Minimal); assert!(!minimal.is_empty()); assert!(minimal.len() < standard.len()); let code_only = config.get_excludes_for_preset(FilterPreset::CodeOnly); assert!(!code_only.is_empty()); assert!(code_only.contains(&"*.md".to_string())); } #[test] fn test_categories() { let config = FilterConfig::new(); let media_excludes = config.get_excludes_for_categories(&["media_files"]); assert!(media_excludes.contains(&"*.png".to_string())); assert!(media_excludes.contains(&"*.mp4".to_string())); let multiple = config.get_excludes_for_categories(&["lock_files", "cache"]); assert!(multiple.contains(&"*.lock".to_string())); assert!(multiple.contains(&".cache/*".to_string())); } #[test] fn test_serialization() { let config = FilterConfig::new(); // Test that the config can be created and used assert!(!config.default_excludes.is_empty()); assert!(config.get_category_names().contains(&"lock_files")); } } === core/src/ingester.rs === use crate::{cache::*, clone_repository, glob_match, RepositoryMetadata}; use anyhow::{Context, Result}; use git2::{Repository, Status, StatusOptions}; use serde::{Deserialize, Serialize}; use std::io::Write; use std::path::{Path, PathBuf}; use std::time::{SystemTime, UNIX_EPOCH}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IngestOptions { pub include_patterns: Vec, pub exclude_patterns: Vec, pub max_file_size: usize, pub include_untracked: bool, pub branch: Option, pub path_prefix: Option, pub filter_preset: Option, pub apply_default_filters: bool, } impl Default for IngestOptions { fn default() -> Self { Self { include_patterns: Vec::new(), exclude_patterns: Vec::new(), max_file_size: 1048576, include_untracked: false, branch: None, path_prefix: None, filter_preset: None, apply_default_filters: true, } } } impl IngestOptions { pub fn with_preset(preset: crate::FilterPreset) -> Self { Self { filter_preset: Some(preset), apply_default_filters: false, ..Default::default() } } pub fn get_effective_excludes(&self) -> Vec { let mut excludes = self.exclude_patterns.clone(); if let Some(preset) = self.filter_preset { excludes.extend(crate::get_excludes_for_preset(preset)); } else if self.apply_default_filters { excludes.extend(crate::get_default_excludes()); } excludes.sort(); excludes.dedup(); excludes } } pub struct Ingester { repo: Repository, pub options: IngestOptions, effective_excludes: Vec, pub cache: Option, pub cache_key: Option, } impl Ingester { pub fn new(repo: Repository, options: IngestOptions) -> Self { let effective_excludes = options.get_effective_excludes(); Self { repo, options, effective_excludes, cache: None, cache_key: None, } } pub fn from_path(path: &Path, options: IngestOptions) -> Result { let repo = Repository::open(path).context("Failed to open repository")?; Ok(Self::new(repo, options)) } pub fn from_url(url: &str, options: IngestOptions) -> Result { let repo = clone_repository(url, options.branch.as_deref())?; Ok(Self::new(repo, options)) } pub fn from_url_cached(url: &str, options: IngestOptions) -> Result { let repo = clone_repository(url, 
options.branch.as_deref())?; let mut ingester = Self::new(repo, options.clone()); ingester.cache = RepositoryCache::new().ok(); ingester.cache_key = Some(RepositoryCache::generate_cache_key( url, options.branch.as_deref(), )); Ok(ingester) } pub fn get_filter_preset(&self) -> Option { self.options.filter_preset } fn should_include(&self, path: &Path) -> Result { let status = self.repo.status_file(path)?; if status.contains(Status::IGNORED) && !self.options.include_untracked { return Ok(false); } if path.components().any(|c| c.as_os_str() == ".git") { return Ok(false); } let path_str = path.to_string_lossy(); for pattern in &self.effective_excludes { if glob_match(pattern, &path_str) { return Ok(false); } } if !self.options.include_patterns.is_empty() { return Ok(self.options.include_patterns.iter().any(|p| { // Handle directory patterns (ending with /) if p.ends_with("/") { let dir_prefix = &p[..p.len() - 1]; path_str.starts_with(dir_prefix) && path_str.len() > dir_prefix.len() } else if !p.contains('/') { // Pattern without path separator - match filename only path.file_name() .and_then(|n| n.to_str()) .map(|filename| glob_match(p, filename)) .unwrap_or(false) } else { // Pattern with path separator - match full path glob_match(p, &path_str) } })); } Ok(true) } pub fn ingest(&self, output: &mut W) -> Result<()> { let files = self.collect_filtered_files()?; let workdir = self .repo .workdir() .context("Repository has no working directory")?; let mut processed = 0; for file in files { let full_path = workdir.join(&file); if full_path.exists() && full_path.is_file() { self.ingest_file(&full_path, &file, output)?; processed += 1; } } if processed == 0 { eprintln!("Warning: No files found to ingest"); } Ok(()) } pub fn ingest_cached(&mut self, output: &mut W) -> Result<()> { let commit_hash = self.get_current_commit()?; if let Some(ref mut cache) = self.cache { if let Some(ref cache_key) = self.cache_key { match cache.check_commit(cache_key, &commit_hash) { CacheCommitStatus::Match => { if let Ok(Some(cache_entry)) = cache.get(cache_key) { eprintln!("✓ Using cache (commit: {})", &commit_hash[..8]); return self.filter_cached_files(cache_entry, output); } } CacheCommitStatus::Outdated => { eprintln!("↻ Cache outdated, fetching new data..."); let _ = cache.remove(cache_key); } CacheCommitStatus::NotCached => { eprintln!("→ No cache found, fetching repository..."); } } } } let cache_entry = self.fetch_and_cache()?; self.filter_cached_files(cache_entry, output) } fn ingest_file(&self, path: &Path, relative: &Path, output: &mut W) -> Result<()> { let metadata = std::fs::metadata(path)?; if metadata.len() > self.options.max_file_size as u64 { return Ok(()); } let content = std::fs::read_to_string(path).unwrap_or_else(|_| "[Binary file]".to_string()); writeln!(output, "=== {} ===", relative.display())?; writeln!(output, "{content}")?; writeln!(output)?; Ok(()) } fn collect_filtered_files(&self) -> Result> { let head_result = self.repo.head(); let has_commits = head_result.is_ok(); let mut files: Vec = Vec::new(); if has_commits { let head = head_result?; let tree = head.peel_to_tree()?; // when path_prefix is set, walk from that subtree // otherwise walk from root let (tree_to_walk, is_subtree) = if let Some(prefix) = &self.options.path_prefix { match tree.get_path(Path::new(prefix)) { Ok(entry) => (self.repo.find_tree(entry.id())?, true), Err(_) => return Ok(Vec::new()), } } else { (tree, false) }; tree_to_walk.walk(git2::TreeWalkMode::PreOrder, |dir, entry| { if entry.kind() == 
    fn collect_filtered_files(&self) -> Result<Vec<PathBuf>> {
        let head_result = self.repo.head();
        let has_commits = head_result.is_ok();

        let mut files: Vec<PathBuf> = Vec::new();

        if has_commits {
            let head = head_result?;
            let tree = head.peel_to_tree()?;

            // when path_prefix is set, walk from that subtree
            // otherwise walk from root
            let (tree_to_walk, is_subtree) = if let Some(prefix) = &self.options.path_prefix {
                match tree.get_path(Path::new(prefix)) {
                    Ok(entry) => (self.repo.find_tree(entry.id())?, true),
                    Err(_) => return Ok(Vec::new()),
                }
            } else {
                (tree, false)
            };

            tree_to_walk.walk(git2::TreeWalkMode::PreOrder, |dir, entry| {
                if entry.kind() == Some(git2::ObjectType::Blob) {
                    if let Some(name) = entry.name() {
                        let path = if dir.is_empty() {
                            PathBuf::from(name)
                        } else {
                            PathBuf::from(dir).join(name)
                        };

                        // when walking a subtree, paths are relative to that subtree
                        // prepend the prefix to get the full repository path
                        let full_path = if is_subtree {
                            if let Some(prefix) = &self.options.path_prefix {
                                PathBuf::from(prefix).join(path)
                            } else {
                                path
                            }
                        } else {
                            path
                        };

                        if self.should_include(&full_path).unwrap_or(false) {
                            files.push(full_path);
                        }
                    }
                }
                git2::TreeWalkResult::Ok
            })?;
        }

        // handle untracked files
        if self.options.include_untracked || !has_commits {
            let mut status_opts = StatusOptions::new();
            status_opts.include_untracked(true);
            status_opts.include_ignored(false);

            let statuses = self.repo.statuses(Some(&mut status_opts))?;
            for status in statuses.iter() {
                if status.status().contains(Status::WT_NEW) {
                    if let Some(path) = status.path() {
                        let path_buf = PathBuf::from(path);
                        if let Some(prefix) = &self.options.path_prefix {
                            if !path.starts_with(prefix) {
                                continue;
                            }
                        }
                        if self.should_include(&path_buf).unwrap_or(false) {
                            files.push(path_buf);
                        }
                    }
                }
            }
        }

        files.sort();
        files.dedup();
        Ok(files)
    }

    fn get_current_commit(&self) -> Result<String> {
        let head = self.repo.head()?;
        let commit = head.peel_to_commit()?;
        Ok(commit.id().to_string())
    }

    fn fetch_and_cache(&mut self) -> Result<CacheEntry> {
        let workdir = self
            .repo
            .workdir()
            .context("Repository has no working directory")?;
        let commit_hash = self.get_current_commit()?;

        let mut files = Vec::new();
        let mut total_size = 0u64;
        let all_files = self.collect_all_repository_files()?;

        for file_path in all_files {
            let full_path = workdir.join(&file_path);
            if !full_path.exists() || !full_path.is_file() {
                continue;
            }

            let metadata = std::fs::metadata(&full_path)?;
            let content = std::fs::read(&full_path)?;
            let is_binary = content.iter().take(8000).any(|&b| b == 0);

            total_size += metadata.len();
            files.push(CachedFile {
                path: file_path,
                content,
                size: metadata.len(),
                is_binary,
            });
        }

        let cache_entry = CacheEntry {
            repo_url: self.repo.path().to_string_lossy().to_string(),
            branch: self
                .options
                .branch
                .clone()
                .unwrap_or_else(|| "HEAD".to_string()),
            commit_hash: commit_hash.clone(),
            files: files.clone(),
            metadata: CacheMetadata {
                total_files: files.len(),
                total_size,
                tree_hash: commit_hash.clone(),
                cache_version: "1.0.0".to_string(),
            },
            created_at: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(),
            last_accessed: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(),
        };

        if let Some(ref mut cache) = self.cache {
            if let Some(ref cache_key) = self.cache_key {
                cache.put(cache_key.clone(), cache_entry.clone())?;
                eprintln!(
                    "✓ Cached {} files ({:.2} MB)",
                    files.len(),
                    total_size as f64 / 1_048_576.0
                );
            }
        }

        Ok(cache_entry)
    }

    fn collect_all_repository_files(&self) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();
        let head = self.repo.head()?;
        let tree = head.peel_to_tree()?;

        tree.walk(git2::TreeWalkMode::PreOrder, |dir, entry| {
            if entry.kind() == Some(git2::ObjectType::Blob) {
                if let Some(name) = entry.name() {
                    let path = if dir.is_empty() {
                        PathBuf::from(name)
                    } else {
                        PathBuf::from(dir).join(name)
                    };
                    files.push(path);
                }
            }
            git2::TreeWalkResult::Ok
        })?;

        Ok(files)
    }
    fn filter_cached_files<W: Write>(
        &self,
        cache_entry: CacheEntry,
        output: &mut W,
    ) -> Result<()> {
        let mut processed = 0;
        let mut filtered_size = 0u64;

        for cached_file in &cache_entry.files {
            // Apply path_prefix filter first if set
            if let Some(ref prefix) = self.options.path_prefix {
                let path_str = cached_file.path.to_string_lossy();
                // Ensure we are checking directory boundaries properly
                let prefix_with_slash = if prefix.ends_with("/") {
                    prefix.to_string()
                } else {
                    format!("{}/", prefix)
                };
                if !path_str.starts_with(&prefix_with_slash) {
                    continue;
                }
            }

            if !self.should_include(&cached_file.path)? {
                continue;
            }

            if cached_file.size > self.options.max_file_size as u64 {
                continue;
            }

            let content = if cached_file.is_binary {
                "[Binary file]".to_string()
            } else {
                String::from_utf8_lossy(&cached_file.content).to_string()
            };

            writeln!(output, "=== {} ===", cached_file.path.display())?;
            writeln!(output, "{}", content)?;
            writeln!(output)?;

            processed += 1;
            filtered_size += cached_file.size;
        }

        eprintln!(
            "→ Filtered: {} files ({:.2} MB) from {} total",
            processed,
            filtered_size as f64 / 1_048_576.0,
            cache_entry.metadata.total_files
        );

        Ok(())
    }

    pub fn get_filter_stats(&self) -> Result<FilterStats> {
        let workdir = self
            .repo
            .workdir()
            .context("Repository has no working directory")?;
        let all_files = self.collect_all_repository_files()?;

        let mut stats = FilterStats {
            total_files: all_files.len(),
            ..Default::default()
        };

        for file in all_files {
            let full_path = workdir.join(&file);
            if let Ok(metadata) = std::fs::metadata(&full_path) {
                stats.total_size += metadata.len();
                if self.should_include(&file)? {
                    stats.included_files += 1;
                    stats.included_size += metadata.len();
                } else {
                    stats.excluded_files += 1;
                    stats.excluded_size += metadata.len();
                }
            }
        }

        Ok(stats)
    }

    pub fn generate_diff(&self, base: &str, head: &str) -> Result<String> {
        let repo = &self.repo;

        let (base_object, _) = repo.revparse_ext(base)?;
        let (head_object, _) = repo.revparse_ext(head)?;

        let base_commit = base_object.peel_to_commit()?;
        let head_commit = head_object.peel_to_commit()?;

        let base_tree = base_commit.tree()?;
        let head_tree = head_commit.tree()?;

        let mut diff_opts = git2::DiffOptions::new();
        let diff =
            repo.diff_tree_to_tree(Some(&base_tree), Some(&head_tree), Some(&mut diff_opts))?;

        let mut output = String::new();
        output.push_str(&format!("# Comparing {} to {}\n\n", base, head));

        let stats = diff.stats()?;
        output.push_str(&format!("Files changed: {}\n", stats.files_changed()));
        output.push_str(&format!("Insertions: {}\n", stats.insertions()));
        output.push_str(&format!("Deletions: {}\n\n", stats.deletions()));

        diff.print(git2::DiffFormat::Patch, |_delta, _hunk, line| {
            let content = std::str::from_utf8(line.content()).unwrap_or("[binary]");
            output.push_str(content);
            true
        })?;

        Ok(output)
    }

    pub fn get_metadata(&self) -> Result<RepositoryMetadata> {
        let repo = &self.repo;

        let default_branch = repo
            .head()
            .ok()
            .and_then(|h| h.shorthand().map(String::from))
            .unwrap_or_else(|| "main".to_string());

        let mut branches = Vec::new();
        for (branch, _) in (repo.branches(Some(git2::BranchType::Local))?).flatten() {
            if let Ok(Some(name)) = branch.name() {
                branches.push(name.to_string());
            }
        }

        let remote_url = repo
            .find_remote("origin")
            .ok()
            .and_then(|r| r.url().map(String::from));

        let last_commit = repo
            .head()
            .ok()
            .and_then(|h| h.peel_to_commit().ok())
            .map(|c| {
                format!(
                    "{} - {}",
                    c.id().to_string().chars().take(8).collect::<String>(),
                    c.summary().unwrap_or("No message")
                )
            });

        let size = repo.workdir().and_then(|w| {
            walkdir::WalkDir::new(w)
                .into_iter()
                .filter_map(|e| e.ok())
                .filter_map(|e| e.metadata().ok())
                .map(|m| m.len())
                .reduce(|a, b| a + b)
        });

        Ok(RepositoryMetadata {
            url: remote_url.clone().unwrap_or_default(),
            default_branch,
            branches,
            size,
            last_commit,
            remote_url,
        })
    }
}
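
// A minimal usage sketch of the ingestion API above: clone a public repository, apply the
// Standard filter preset, and write the combined output into an in-memory buffer. The
// repository URL is an illustrative placeholder and the test needs network access, so it is
// marked `#[ignore]`.
#[cfg(test)]
mod ingester_usage_sketch {
    use super::*;

    #[test]
    #[ignore = "requires network access"]
    fn ingest_remote_repository_into_buffer() -> Result<()> {
        let options = IngestOptions {
            filter_preset: Some(crate::FilterPreset::Standard),
            apply_default_filters: false,
            ..Default::default()
        };
        let ingester = Ingester::from_url("https://github.com/octocat/Hello-World", options)?;

        // `ingest` accepts any `std::io::Write` sink; a Vec<u8> keeps the output in memory.
        let mut buffer: Vec<u8> = Vec::new();
        ingester.ingest(&mut buffer)?;
        assert!(!buffer.is_empty());
        Ok(())
    }
}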
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct FilterStats {
    pub total_files: usize,
    pub included_files: usize,
    pub excluded_files: usize,
    pub total_size: u64,
    pub included_size: u64,
    pub excluded_size: u64,
    pub excluded_by_filter: usize,
}

impl FilterStats {
    pub fn inclusion_rate(&self) -> f64 {
        if self.total_files == 0 {
            0.0
        } else {
            self.included_files as f64 / self.total_files as f64
        }
    }

    pub fn size_reduction(&self) -> f64 {
        if self.total_size == 0 {
            0.0
        } else {
            self.excluded_size as f64 / self.total_size as f64
        }
    }
}

pub trait IngestionCallback: Send + Sync {
    fn on_progress(&mut self, _stage: &str, _message: &str) {}
    fn on_file(&mut self, _path: &Path, _content: &str) {}
    fn on_complete(&mut self, _files: usize, _bytes: usize) {}
    fn on_error(&mut self, _error: &str) {}
}

=== core/src/lib.rs ===
pub mod cache;
pub mod filtering;
pub mod ingester;
pub mod parser;

pub use cache::{
    CacheCommitStatus, CacheEntry, CacheManager, CacheStats, CachedFile, RepositoryCache,
};
pub use filtering::{get_default_excludes, get_excludes_for_preset, FilterConfig, FilterPreset};
pub use ingester::{FilterStats, IngestOptions, Ingester, IngestionCallback};
pub use parser::{
    normalize_source_url, parse_github_url, validate_github_name, GitHubUrlType, ParsedGitHubUrl,
};

use anyhow::Result;
use git2::Repository;
use serde::{Deserialize, Serialize};
use std::io::IsTerminal;
use std::path::Path;
use std::time::{SystemTime, UNIX_EPOCH};

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepositoryMetadata {
    pub url: String,
    pub default_branch: String,
    pub branches: Vec<String>,
    pub size: Option<u64>,
    pub last_commit: Option<String>,
    pub remote_url: Option<String>,
}

pub fn is_remote_url(source: &str) -> bool {
    source.starts_with("https://github.com/")
        || source.starts_with("https://gitlab.com/")
        || source.starts_with("https://gist.github.com/")
        || source.starts_with("https://raw.githubusercontent.com/")
        || source.starts_with("https://gist.githubusercontent.com/")
}

pub fn clone_repository(url: &str, branch: Option<&str>) -> Result<Repository> {
    if !is_remote_url(url) {
        return Err(anyhow::anyhow!("Invalid or unsafe URL"));
    }

    let temp_id = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .as_millis();
    let path = std::env::temp_dir().join(format!("githem-{temp_id}"));

    let mut fetch_opts = git2::FetchOptions::new();
    let mut callbacks = git2::RemoteCallbacks::new();

    callbacks.credentials(|url, username_from_url, allowed_types| {
        if !is_remote_url(url) {
            return Err(git2::Error::from_str(
                "Invalid URL for credential authentication",
            ));
        }

        if allowed_types.contains(git2::CredentialType::SSH_KEY) {
            if let Ok(cred) = git2::Cred::ssh_key_from_agent(username_from_url.unwrap_or("git")) {
                return Ok(cred);
            }
            if let Ok(home) = std::env::var("HOME") {
                let ssh_dir = Path::new(&home).join(".ssh");
                if ssh_dir.exists() {
                    let private_key = ssh_dir.join("id_ed25519");
                    let public_key = ssh_dir.join("id_ed25519.pub");
                    if private_key.exists() && public_key.exists() {
                        return git2::Cred::ssh_key(
                            username_from_url.unwrap_or("git"),
                            Some(&public_key),
                            &private_key,
                            None,
                        );
                    }
                }
            }
        }

        if allowed_types.contains(git2::CredentialType::DEFAULT) && url.starts_with("https://") {
            return git2::Cred::default();
        }

        Err(git2::Error::from_str(
            "No secure authentication method available",
        ))
    });

    if std::io::stderr().is_terminal() {
        callbacks.transfer_progress(|stats| {
            if stats.total_objects() > 0 {
                eprint!(
                    "\rReceiving objects: {}% ({}/{})",
                    (100 * stats.received_objects()) / stats.total_objects(),
                    stats.received_objects(),
                    stats.total_objects()
                );
            }
            true
        });
    }

    fetch_opts.remote_callbacks(callbacks);
    fetch_opts.depth(1);
    fetch_opts.download_tags(git2::AutotagOption::None);

    let mut builder = git2::build::RepoBuilder::new();
    builder.fetch_options(fetch_opts);
    if let Some(branch) = branch {
        builder.branch(branch);
    }

    let repo = builder.clone(url, &path)?;
    if std::io::stderr().is_terminal() {
        eprintln!();
    }

    Ok(repo)
}
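
// A small sketch of the URL guard used by `clone_repository`: only the whitelisted https hosts
// are treated as remote sources, so ssh remotes and local paths are rejected before any network
// work happens. The sample URLs below are illustrative.
#[cfg(test)]
mod remote_url_sketch {
    use super::is_remote_url;

    #[test]
    fn only_whitelisted_https_hosts_count_as_remote() {
        assert!(is_remote_url("https://github.com/rust-lang/git2-rs"));
        assert!(is_remote_url("https://gitlab.com/example/project"));
        assert!(!is_remote_url("git@github.com:rust-lang/git2-rs.git"));
        assert!(!is_remote_url("/tmp/local-checkout"));
    }
}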
pub fn checkout_branch(repo: &Repository, branch_name: &str) -> Result<()> {
    let (object, reference) = repo.revparse_ext(branch_name)?;
    repo.checkout_tree(&object, None)?;
    match reference {
        Some(gref) => repo.set_head(gref.name().unwrap())?,
        None => repo.set_head_detached(object.id())?,
    }
    Ok(())
}

pub fn glob_match(pattern: &str, path: &str) -> bool {
    if pattern.starts_with("*.") {
        return path.ends_with(&pattern[1..]);
    }
    if let Some(prefix) = pattern.strip_suffix("/*") {
        return path.starts_with(prefix) && path.len() > prefix.len();
    }
    if pattern.contains('*') {
        let parts: Vec<&str> = pattern.split('*').collect();
        if parts.len() == 2 {
            return path.starts_with(parts[0]) && path.ends_with(parts[1]);
        }
    }
    path == pattern || path.starts_with(&format!("{pattern}/"))
}

pub fn estimate_tokens(content: &str) -> usize {
    let chars = content.len();
    let words = content.split_whitespace().count();
    let lines = content.lines().count();
    ((chars as f32 / 3.3 + words as f32 * 0.75) / 2.0 + lines as f32 * 0.1) as usize
}

pub fn count_files(content: &str) -> usize {
    content.matches("=== ").count()
}

pub fn generate_tree(content: &str) -> String {
    let mut tree = String::new();
    tree.push_str("Repository structure:\n");
    for line in content.lines() {
        if line.starts_with("=== ") && line.ends_with(" ===") {
            let path = &line[4..line.len() - 4];
            tree.push_str(&format!("📄 {path}\n"));
        }
    }
    tree
}
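
// A small sketch of the simplified glob matching and output helpers above; the sample patterns,
// paths, and ingest output are illustrative assumptions.
#[cfg(test)]
mod helper_sketch {
    use super::{count_files, estimate_tokens, glob_match};

    #[test]
    fn glob_match_handles_extension_prefix_and_bare_patterns() {
        assert!(glob_match("*.lock", "Cargo.lock")); // "*.ext" matches by suffix
        assert!(glob_match("node_modules/*", "node_modules/react/index.js")); // "dir/*" matches by prefix
        assert!(glob_match("src", "src/main.rs")); // bare name matches the whole directory
        assert!(!glob_match("*.md", "src/lib.rs"));
    }

    #[test]
    fn token_estimate_and_file_count_follow_ingest_output() {
        let output = "=== src/main.rs ===\nfn main() {}\n\n=== README.md ===\n# demo\n";
        assert_eq!(count_files(output), 2);
        assert!(estimate_tokens(output) > 0);
    }
}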
=== core/src/parser.rs ===
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ParsedGitHubUrl {
    pub owner: String,
    pub repo: String,
    pub branch: Option<String>,
    pub path: Option<String>,
    pub url_type: GitHubUrlType,
    pub canonical_url: String,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum GitHubUrlType {
    Repository,
    Tree,
    Blob,
    Raw,
    Commit,
    Gist,
    GistRaw,
    Compare,
}

pub fn parse_github_url(url: &str) -> Option<ParsedGitHubUrl> {
    let url = url.trim().trim_end_matches('/');

    if url.contains("gist.github.com") {
        return parse_gist_url(url);
    }
    if url.contains("raw.githubusercontent.com") {
        return parse_raw_url(url);
    }

    if let Some(path) = url
        .strip_prefix("https://github.com/")
        .or_else(|| url.strip_prefix("http://github.com/"))
        .or_else(|| url.strip_prefix("github.com/"))
    {
        let parts: Vec<&str> = path.split('/').collect();
        if parts.len() >= 2 {
            let owner = parts[0].to_string();
            let repo = parts[1].to_string();

            if parts.len() == 2 {
                return Some(ParsedGitHubUrl {
                    owner: owner.clone(),
                    repo: repo.clone(),
                    branch: None,
                    path: None,
                    url_type: GitHubUrlType::Repository,
                    canonical_url: format!("https://github.com/{}/{}", owner, repo),
                });
            }

            if parts.len() >= 4 {
                match parts[2] {
                    "tree" | "blob" => {
                        let all_parts = &parts[3..];
                        if all_parts.is_empty() {
                            return None;
                        }

                        let mut branch_end_idx = all_parts.len();
                        for (i, part) in all_parts.iter().enumerate() {
                            if part.contains('.') && !part.ends_with(".git") {
                                branch_end_idx = i;
                                break;
                            }
                            if matches!(
                                *part,
                                "src" | "lib" | "test" | "tests" | "docs" | "bin" | "pkg" | "cmd"
                                    | "internal" | "api" | "web" | "client" | "server" | "assets"
                                    | "public"
                            ) {
                                branch_end_idx = i;
                                break;
                            }
                        }

                        let branch = all_parts[..branch_end_idx].join("/");
                        let path = if branch_end_idx < all_parts.len() {
                            Some(all_parts[branch_end_idx..].join("/"))
                        } else {
                            None
                        };

                        return Some(ParsedGitHubUrl {
                            owner: owner.clone(),
                            repo: repo.clone(),
                            branch: Some(branch),
                            path,
                            url_type: if parts[2] == "tree" {
                                GitHubUrlType::Tree
                            } else {
                                GitHubUrlType::Blob
                            },
                            canonical_url: format!("https://github.com/{}/{}", owner, repo),
                        });
                    }
                    "commit" => {
                        return Some(ParsedGitHubUrl {
                            owner: owner.clone(),
                            repo: repo.clone(),
                            branch: Some(parts[3].to_string()),
                            path: None,
                            url_type: GitHubUrlType::Commit,
                            canonical_url: format!("https://github.com/{}/{}", owner, repo),
                        });
                    }
                    "compare" => {
                        let compare_spec = parts[3..].join("/");
                        return Some(ParsedGitHubUrl {
                            owner: owner.clone(),
                            repo: repo.clone(),
                            branch: Some(compare_spec),
                            path: None,
                            url_type: GitHubUrlType::Compare,
                            canonical_url: format!("https://github.com/{}/{}", owner, repo),
                        });
                    }
                    _ => {}
                }
            }
        }
    }

    None
}

fn parse_gist_url(url: &str) -> Option<ParsedGitHubUrl> {
    if let Some(path) = url
        .strip_prefix("https://gist.github.com/")
        .or_else(|| url.strip_prefix("http://gist.github.com/"))
    {
        let parts: Vec<&str> = path.split('/').collect();
        if parts.len() == 1 {
            return Some(ParsedGitHubUrl {
                owner: "anonymous".to_string(),
                repo: parts[0].to_string(),
                branch: None,
                path: None,
                url_type: GitHubUrlType::Gist,
                canonical_url: format!("https://gist.github.com/{}", parts[0]),
            });
        }
        if parts.len() >= 2 {
            return Some(ParsedGitHubUrl {
                owner: parts[0].to_string(),
                repo: parts[1].to_string(),
                branch: None,
                path: None,
                url_type: GitHubUrlType::Gist,
                canonical_url: format!("https://gist.github.com/{}/{}", parts[0], parts[1]),
            });
        }
    }
    None
}

fn parse_raw_url(url: &str) -> Option<ParsedGitHubUrl> {
    let path = url
        .strip_prefix("https://raw.githubusercontent.com/")
        .or_else(|| url.strip_prefix("http://raw.githubusercontent.com/"))?;

    let parts: Vec<&str> = path.split('/').collect();
    if parts.len() >= 3 {
        let owner = parts[0].to_string();
        let repo = parts[1].to_string();
        let branch = parts[2].to_string();
        let path = if parts.len() > 3 {
            Some(parts[3..].join("/"))
        } else {
            None
        };

        return Some(ParsedGitHubUrl {
            owner: owner.clone(),
            repo: repo.clone(),
            branch: Some(branch),
            path,
            url_type: GitHubUrlType::Raw,
            canonical_url: format!("https://github.com/{}/{}", owner, repo),
        });
    }
    None
}

pub fn normalize_source_url(
    source: &str,
    branch: Option<String>,
    path_prefix: Option<String>,
) -> Result<(String, Option<String>, Option<String>), String> {
    if let Some(parsed) = parse_github_url(source) {
        let final_branch = branch.or(parsed.branch);
        let final_path = path_prefix.or(parsed.path);
        return Ok((parsed.canonical_url, final_branch, final_path));
    }

    if !source.contains("://") && source.matches('/').count() == 1 {
        let parts: Vec<&str> = source.split('/').collect();
        if parts.len() == 2 && validate_github_name(parts[0]) && validate_github_name(parts[1]) {
            let url = format!("https://github.com/{}/{}", parts[0], parts[1]);
            return Ok((url, branch, path_prefix));
        }
    }

    Ok((source.to_string(), branch, path_prefix))
}

pub fn validate_github_name(name: &str) -> bool {
    !name.is_empty()
        && name.len() <= 39
        && name
            .chars()
            .all(|c| c.is_alphanumeric() || c == '-' || c == '_' || c == '.')
        && !name.starts_with(['-', '.'])
        && !name.ends_with(['-', '.'])
}
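
// A short sketch of the URL parsing behaviour above: a GitHub "tree" URL is split into branch
// and sub-path, and an "owner/repo" shorthand is expanded to a canonical clone URL. The
// repository names are illustrative examples.
#[cfg(test)]
mod parser_sketch {
    use super::{normalize_source_url, parse_github_url, GitHubUrlType};

    #[test]
    fn tree_url_is_split_into_branch_and_path() {
        let parsed =
            parse_github_url("https://github.com/rust-lang/cargo/tree/master/src").unwrap();
        assert_eq!(parsed.owner, "rust-lang");
        assert_eq!(parsed.repo, "cargo");
        assert_eq!(parsed.branch.as_deref(), Some("master"));
        assert_eq!(parsed.path.as_deref(), Some("src"));
        assert_eq!(parsed.url_type, GitHubUrlType::Tree);
    }

    #[test]
    fn owner_repo_shorthand_expands_to_canonical_url() {
        let (url, branch, path) = normalize_source_url("rust-lang/cargo", None, None).unwrap();
        assert_eq!(url, "https://github.com/rust-lang/cargo");
        assert!(branch.is_none());
        assert!(path.is_none());
    }
}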