38 KiB
38 KiB
Git 仓库统计与缓存系统 - 架构设计文档
1. 总体架构
1.1 模块划分
┌─────────────────────────────────────────────────────────────┐
│ API Layer │
│ ┌────────────┬────────────┬────────────┬─────────────┐ │
│ │ Repo APIs │ Stats APIs │ Task APIs │ Health APIs │ │
│ └────────────┴────────────┴────────────┴─────────────┘ │
└─────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ Service Layer │
│ ┌──────────────────┬──────────────────┬─────────────────┐ │
│ │ RepoService │ StatsService │ TaskService │ │
│ │ - AddRepos │ - Calculate │ - Submit │ │
│ │ - UpdateRepo │ - QueryCache │ - Query │ │
│ │ - SwitchBranch │ - CountCommits │ - Cancel │ │
│ │ - SetCreds │ │ │ │
│ │ - Reset │ │ │ │
│ └──────────────────┴──────────────────┴─────────────────┘ │
└─────────────────────────────────────────────────────────────┘
│
┌────────────────────┼────────────────────┐
▼ ▼ ▼
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ Git Manager │ │ Cache Layer │ │ Task Queue │
│ - Clone │ │ - Get/Set │ │ - Enqueue │
│ - Pull │ │ - Invalidate │ │ - Dequeue │
│ - Checkout │ │ - KeyGen │ │ - Dedupe │
│ - Stats │ └──────────────┘ └──────────────┘
│ (cmd/go-git) │ │
└──────────────┘ ▼
│ ┌──────────────────┐
│ │ Worker Pool │
│ │ ┌────────────┐ │
│ │ │ Clone │ │
│ │ │ Pull │ │
│ │ │ Switch │ │
│ │ │ Stats │ │
│ │ │ Reset │ │
│ │ └────────────┘ │
│ └──────────────────┘
▼
┌─────────────────────────────────────────────────────────────┐
│ Storage Layer │
│ ┌──────────────┬──────────────┬──────────────────────────┐ │
│ │ Repo Store │ Task Store │ StatsCache Store │ │
│ │ (SQLite/PG) │ (SQLite/PG) │ (SQLite/PG + Disk) │ │
│ └──────────────┴──────────────┴──────────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
│
▼
┌──────────────────┐
│ File System │
│ workspace/cache/│
│ workspace/stats/│
└──────────────────┘
1.2 目录结构
GitCodeStatic/
├── cmd/
│ └── server/
│ └── main.go # 主程序入口
├── internal/
│ ├── api/ # API层
│ │ ├── handlers/ # HTTP handlers
│ │ │ ├── repo.go # 仓库相关API
│ │ │ ├── stats.go # 统计相关API
│ │ │ ├── task.go # 任务相关API
│ │ │ └── health.go # 健康检查API
│ │ ├── middleware/ # 中间件
│ │ │ ├── logger.go # 日志中间件
│ │ │ ├── recovery.go # 恢复中间件
│ │ │ └── metrics.go # 指标中间件
│ │ └── router.go # 路由配置
│ ├── service/ # 服务层
│ │ ├── repo_service.go # 仓库服务
│ │ ├── stats_service.go # 统计服务
│ │ └── task_service.go # 任务服务
│ ├── worker/ # 异步任务处理
│ │ ├── queue.go # 任务队列
│ │ ├── worker.go # Worker实现
│ │ ├── pool.go # Worker池
│ │ └── handlers.go # 任务处理器
│ ├── git/ # Git操作抽象
│ │ ├── manager.go # Git管理器接口
│ │ ├── cmd_git.go # Git命令实现
│ │ └── go_git.go # go-git实现
│ ├── stats/ # 统计模块
│ │ ├── calculator.go # 统计计算器
│ │ ├── parser.go # Git日志解析
│ │ └── models.go # 统计数据模型
│ ├── cache/ # 缓存模块
│ │ ├── cache.go # 缓存接口
│ │ ├── key.go # 缓存key生成
│ │ └── file_cache.go # 文件+DB缓存实现
│ ├── storage/ # 存储层
│ │ ├── interface.go # 存储接口定义
│ │ ├── sqlite/ # SQLite实现
│ │ │ ├── repo.go
│ │ │ ├── task.go
│ │ │ └── stats_cache.go
│ │ └── postgres/ # PostgreSQL实现(可选)
│ │ ├── repo.go
│ │ ├── task.go
│ │ └── stats_cache.go
│ ├── models/ # 数据模型
│ │ ├── repo.go # 仓库模型
│ │ ├── task.go # 任务模型
│ │ └── stats.go # 统计模型
│ ├── config/ # 配置
│ │ └── config.go # 配置结构和加载
│ ├── logger/ # 日志
│ │ └── logger.go # 结构化日志
│ ├── metrics/ # 指标
│ │ └── metrics.go # 基础指标收集
│ └── security/ # 安全
│ ├── credentials.go # 凭据管理
│ └── validator.go # 输入校验
├── pkg/ # 公共库
│ └── utils/
│ ├── hash.go # 哈希工具
│ └── path.go # 路径工具
├── test/ # 测试
│ ├── unit/ # 单元测试
│ └── integration/ # 集成测试
├── configs/ # 配置文件
│ └── config.yaml
├── scripts/ # 脚本
│ └── init_db.sql # 数据库初始化
├── go.mod
├── go.sum
├── Makefile
├── README.md
└── ARCHITECTURE.md # 本文档
2. 数据模型
2.1 表结构设计 (PostgreSQL/SQLite)
2.1.1 仓库表 (repositories)
-- Registered Git repositories and the state of their local clones.
CREATE TABLE repositories (
    id               INTEGER PRIMARY KEY AUTOINCREMENT,       -- PG: SERIAL PRIMARY KEY
    url              TEXT NOT NULL UNIQUE,                    -- repository URL
    name             TEXT NOT NULL,                           -- repository name (parsed from the URL)
    current_branch   TEXT,                                    -- current branch
    local_path       TEXT NOT NULL UNIQUE,                    -- local cache path
    -- Lifecycle state; the CHECK makes the documented value set enforceable
    -- for every writer instead of living only in a comment.
    status           TEXT NOT NULL DEFAULT 'pending'
                     CONSTRAINT repositories_status_check
                     CHECK (status IN ('pending', 'cloning', 'ready', 'failed')),
    error_message    TEXT,                                    -- error message
    last_pull_at     TIMESTAMP,                               -- time of the last pull
    last_commit_hash TEXT,                                    -- hash of the last commit
    credential_id    TEXT,                                    -- credential ID (refers to encrypted storage)
    created_at       TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at       TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_repositories_status ON repositories(status);
CREATE INDEX idx_repositories_updated_at ON repositories(updated_at);
2.1.2 任务表 (tasks)
-- Asynchronous tasks executed by the worker pool, one row per submission.
CREATE TABLE tasks (
    id            INTEGER PRIMARY KEY AUTOINCREMENT,          -- PG: SERIAL PRIMARY KEY
    task_type     TEXT NOT NULL
                  CONSTRAINT tasks_task_type_check
                  CHECK (task_type IN ('clone', 'pull', 'switch', 'stats', 'reset', 'count_commits')),
    repo_id       INTEGER NOT NULL,                           -- owning repository ID
    status        TEXT NOT NULL DEFAULT 'pending'
                  CONSTRAINT tasks_status_check
                  CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled')),
    priority      INTEGER NOT NULL DEFAULT 0,                 -- priority (larger number = higher priority)
    parameters    TEXT,                                       -- JSON parameters (branch name, stats constraint, ...)
    result        TEXT,                                       -- JSON result
    error_message TEXT,                                       -- error message
    retry_count   INTEGER NOT NULL DEFAULT 0,                 -- number of retries
    started_at    TIMESTAMP,                                  -- start time
    completed_at  TIMESTAMP,                                  -- completion time
    created_at    TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at    TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (repo_id) REFERENCES repositories(id) ON DELETE CASCADE
);
CREATE INDEX idx_tasks_status ON tasks(status);
CREATE INDEX idx_tasks_repo_id ON tasks(repo_id);
CREATE INDEX idx_tasks_type_repo ON tasks(task_type, repo_id, status);
CREATE INDEX idx_tasks_created_at ON tasks(created_at);
-- Task de-duplication: for the same repository + task type + parameters, at
-- most one task may be active (pending or running) at a time.
-- NULL values are treated as distinct by unique indexes, so tasks without
-- parameters would never collide; COALESCE maps NULL to '' so that they are
-- de-duplicated as well. Lookups and ON CONFLICT targets must use the same
-- expression to match this index.
CREATE UNIQUE INDEX idx_tasks_dedup ON tasks(repo_id, task_type, (COALESCE(parameters, '')))
WHERE status IN ('pending', 'running');
2.1.3 统计缓存表 (stats_cache)
-- Metadata for cached statistics results; the result payload itself lives in
-- the file referenced by result_path.
CREATE TABLE stats_cache (
    id               INTEGER PRIMARY KEY AUTOINCREMENT,       -- PG: SERIAL PRIMARY KEY
    repo_id          INTEGER NOT NULL,                        -- repository ID
    branch           TEXT NOT NULL,                           -- branch name
    constraint_type  TEXT NOT NULL
                     CONSTRAINT stats_cache_constraint_type_check
                     CHECK (constraint_type IN ('date_range', 'commit_limit')),
    constraint_value TEXT NOT NULL,                           -- JSON: {"from":"2024-01-01","to":"2024-12-31"} or {"limit":100}
    commit_hash      TEXT NOT NULL,                           -- commit hash the statistics were computed up to
    result_path      TEXT NOT NULL,                           -- path of the result file
    result_size      INTEGER NOT NULL,                        -- result file size (bytes)
    cache_key        TEXT NOT NULL UNIQUE,                    -- cache key (for fast lookup)
    hit_count        INTEGER NOT NULL DEFAULT 0,              -- number of cache hits
    created_at       TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    last_hit_at      TIMESTAMP,                               -- time of the last hit
    FOREIGN KEY (repo_id) REFERENCES repositories(id) ON DELETE CASCADE
);
-- No separate index on cache_key: the UNIQUE constraint above already creates
-- one, and a second index would only add write overhead.
CREATE INDEX idx_stats_cache_repo ON stats_cache(repo_id, branch);
CREATE INDEX idx_stats_cache_created_at ON stats_cache(created_at);
-- Uniqueness: only one row per repository + branch + constraint type +
-- constraint value + commit_hash.
CREATE UNIQUE INDEX idx_stats_cache_unique ON stats_cache(
    repo_id, branch, constraint_type, constraint_value, commit_hash
);
2.1.4 凭据表 (credentials) - 加密存储
-- Stored Git credentials (encrypted storage).
-- NOTE(review): username/password sit next to encrypted_data, which presumably
-- carries the same secret material — confirm whether both are needed.
CREATE TABLE credentials (
id TEXT PRIMARY KEY, -- UUID
username TEXT, -- username (encrypted)
password TEXT, -- password/token (encrypted)
auth_type TEXT NOT NULL DEFAULT 'basic', -- basic/token/ssh
encrypted_data BLOB NOT NULL, -- AES-encrypted JSON data; PG: BYTEA (PostgreSQL has no BLOB type)
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
3. API 设计
3.1 RESTful API 路由
Base URL: /api/v1
3.1.1 仓库管理 API
批量添加仓库
POST /repos/batch
Content-Type: application/json
Request:
{
"urls": [
"https://github.com/user/repo1.git",
"https://github.com/user/repo2.git"
]
}
Response: 200 OK
{
"code": 0,
"message": "success",
"data": {
"total": 2,
"succeeded": [
{
"repo_id": 1,
"url": "https://github.com/user/repo1.git",
"task_id": 101
}
],
"failed": [
{
"url": "https://github.com/user/repo2.git",
"error": "repository already exists"
}
]
}
}
获取仓库列表
GET /repos?status=ready&page=1&page_size=20
Response: 200 OK
{
"code": 0,
"message": "success",
"data": {
"total": 50,
"page": 1,
"page_size": 20,
"repositories": [
{
"id": 1,
"url": "https://github.com/user/repo1.git",
"name": "repo1",
"current_branch": "main",
"status": "ready",
"last_pull_at": "2025-12-31T10:00:00Z",
"last_commit_hash": "abc123...",
"created_at": "2025-12-30T08:00:00Z"
}
]
}
}
获取仓库详情
GET /repos/:id
Response: 200 OK
{
"code": 0,
"message": "success",
"data": {
"id": 1,
"url": "https://github.com/user/repo1.git",
"name": "repo1",
"current_branch": "main",
"local_path": "/workspace/cache/repo1",
"status": "ready",
"error_message": null,
"last_pull_at": "2025-12-31T10:00:00Z",
"last_commit_hash": "abc123...",
"has_credentials": true,
"created_at": "2025-12-30T08:00:00Z",
"updated_at": "2025-12-31T10:00:00Z"
}
}
切换分支
POST /repos/:id/switch-branch
Content-Type: application/json
Request:
{
"branch": "develop"
}
Response: 200 OK
{
"code": 0,
"message": "branch switch task submitted",
"data": {
"task_id": 102,
"repo_id": 1,
"task_type": "switch",
"status": "pending"
}
}
更新仓库(pull)
POST /repos/:id/update
Response: 200 OK
{
"code": 0,
"message": "update task submitted",
"data": {
"task_id": 103,
"repo_id": 1,
"task_type": "pull",
"status": "pending"
}
}
设置凭据
POST /repos/:id/credentials
Content-Type: application/json
Request:
{
"auth_type": "basic", // basic/token
"username": "user",
"password": "token_or_password"
}
Response: 200 OK
{
"code": 0,
"message": "credentials set successfully",
"data": {
"credential_id": "uuid-here"
}
}
重置仓库
POST /repos/:id/reset
Response: 200 OK
{
"code": 0,
"message": "reset task submitted",
"data": {
"task_id": 104,
"repo_id": 1,
"task_type": "reset",
"status": "pending"
}
}
删除仓库
DELETE /repos/:id
Response: 200 OK
{
"code": 0,
"message": "repository deleted successfully"
}
3.1.2 统计 API
触发统计
POST /stats/calculate
Content-Type: application/json
Request:
{
"repo_id": 1,
"branch": "main",
"constraint": {
"type": "date_range", // date_range 或 commit_limit (互斥)
"from": "2024-01-01", // type=date_range时必填
"to": "2024-12-31" // type=date_range时必填
}
}
OR
{
"repo_id": 1,
"branch": "main",
"constraint": {
"type": "commit_limit",
"limit": 100 // type=commit_limit时必填
}
}
Response: 200 OK
{
"code": 0,
"message": "statistics task submitted",
"data": {
"task_id": 105,
"repo_id": 1,
"task_type": "stats",
"status": "pending"
}
}
Error: 400 Bad Request (参数互斥校验)
{
"code": 40001,
"message": "constraint type and parameters mismatch: date_range requires from/to, commit_limit requires limit",
"data": null
}
查询统计结果
GET /stats/result?repo_id=1&branch=main&constraint_type=date_range&from=2024-01-01&to=2024-12-31
Response: 200 OK
{
"code": 0,
"message": "success",
"data": {
"cache_hit": true,
"cached_at": "2025-12-30T15:00:00Z",
"commit_hash": "abc123...",
"statistics": {
"summary": {
"total_commits": 150,
"total_contributors": 5,
"date_range": {
"from": "2024-01-01",
"to": "2024-12-31"
}
},
"by_contributor": [
{
"author": "Alice",
"email": "alice@example.com",
"commits": 50,
"additions": 1000,
"deletions": 200,
"modifications": 150, // 口径: min(additions, deletions)
"net_additions": 800 // additions - deletions
}
]
}
}
}
Response: 404 Not Found (未统计)
{
"code": 40400,
"message": "statistics not found, please submit calculation task first",
"data": null
}
查询某日期到当前的提交次数(辅助查询)
GET /stats/commit-count?repo_id=1&branch=main&from=2024-01-01
Response: 200 OK
{
"code": 0,
"message": "success",
"data": {
"repo_id": 1,
"branch": "main",
"from": "2024-01-01",
"to": "HEAD",
"commit_count": 150,
"queried_at": "2025-12-31T12:00:00Z"
}
}
3.1.3 任务管理 API
获取任务列表
GET /tasks?repo_id=1&status=running&page=1&page_size=20
Response: 200 OK
{
"code": 0,
"message": "success",
"data": {
"total": 3,
"page": 1,
"page_size": 20,
"tasks": [
{
"id": 105,
"task_type": "stats",
"repo_id": 1,
"status": "running",
"parameters": "{\"branch\":\"main\",\"constraint\":{...}}",
"started_at": "2025-12-31T12:00:00Z",
"created_at": "2025-12-31T11:59:00Z"
}
]
}
}
获取任务详情
GET /tasks/:id
Response: 200 OK
{
"code": 0,
"message": "success",
"data": {
"id": 105,
"task_type": "stats",
"repo_id": 1,
"status": "completed",
"parameters": "{\"branch\":\"main\",\"constraint\":{...}}",
"result": "{\"cache_key\":\"...\",\"stats_cache_id\":10}",
"error_message": null,
"retry_count": 0,
"started_at": "2025-12-31T12:00:00Z",
"completed_at": "2025-12-31T12:05:00Z",
"created_at": "2025-12-31T11:59:00Z",
"duration_ms": 300000
}
}
取消任务
POST /tasks/:id/cancel
Response: 200 OK
{
"code": 0,
"message": "task cancelled successfully"
}
Response: 400 Bad Request (任务已完成)
{
"code": 40002,
"message": "task cannot be cancelled: already completed",
"data": null
}
3.1.4 健康检查 API
GET /health
Response: 200 OK
{
"status": "healthy",
"timestamp": "2025-12-31T12:00:00Z",
"components": {
"database": "ok",
"worker_pool": "ok",
"git_available": true
}
}
3.2 错误码设计
0 - 成功
40001 - 参数校验失败(互斥参数、缺失参数等)
40002 - 操作不允许(任务状态不正确等)
40400 - 资源未找到
40900 - 资源冲突(仓库已存在等)
50000 - 内部服务器错误
50001 - 数据库错误
50002 - Git操作失败
50003 - 任务队列错误
4. 异步任务与并发设计
4.1 任务类型
// Task type identifiers; the values match those listed for tasks.task_type.
const (
TaskTypeClone = "clone" // clone a repository
TaskTypePull = "pull" // update (pull) a repository
TaskTypeSwitch = "switch" // switch branch
TaskTypeReset = "reset" // reset a repository
TaskTypeStats = "stats" // compute code statistics
TaskTypeCountCommits = "count_commits" // count commits
)
4.2 任务队列架构
┌─────────────┐
│ Submit │
│ Task │
└──────┬──────┘
│
▼
┌─────────────────────────────────┐
│ Task Deduplication │
│ (Check unique index in DB) │
└──────┬──────────────────────────┘
│
▼
┌─────────────────────────────────┐
│ In-Memory Queue │
│ (Buffered Channel) │
│ - Priority Queue │
│ - FIFO within same priority │
└──────┬──────────────────────────┘
│
▼
┌─────────────────────────────────┐
│ Worker Pool │
│ ┌──────────┐ ┌──────────┐ │
│ │ Worker 1 │ │ Worker 2 │... │
│ └────┬─────┘ └────┬─────┘ │
└───────┼─────────────┼───────────┘
│ │
▼ ▼
┌────────────────────────┐
│ Task Handlers │
│ - CloneHandler │
│ - PullHandler │
│ - StatsHandler │
│ ... │
└────────────────────────┘
4.3 幂等与去重策略
- 数据库层去重:通过唯一索引 idx_tasks_dedup 实现
  - 同一 repo_id + task_type + parameters 的 pending/running 任务只能存在一个
  - 提交任务时先查询,若存在则返回已有任务ID
- 任务合并:
  - 相同参数的任务自动合并为一个
  - 返回相同的 task_id 给所有提交者
- 幂等性保证:
  - Clone: 检查本地目录是否已存在,存在则跳过
  - Pull: 可重复执行,git pull 本身幂等
  - Switch: 检查当前分支是否已是目标分支
  - Stats: 缓存命中则跳过计算
  - Reset: 删除目录+缓存后重新 clone
4.4 并发控制
# Per-task-type worker concurrency.
# NOTE(review): the default configuration in section 9.2 lists these keys
# under `worker:` rather than `worker_pool:` — confirm which key the loader reads.
worker_pool:
  clone_workers: 2     # clone concurrency (I/O-bound, kept small)
  pull_workers: 2      # pull concurrency
  stats_workers: 2     # stats concurrency (CPU-bound; size according to CPU core count)
  general_workers: 4   # concurrency for all other task types
4.5 超时策略
// Per-task-type execution timeouts.
// NOTE(review): no timeout constant is defined here for the reset task type — confirm which one applies.
const (
CloneTimeout = 10 * time.Minute // clone timeout
PullTimeout = 5 * time.Minute // pull timeout
SwitchTimeout = 1 * time.Minute // branch switch timeout
StatsTimeout = 30 * time.Minute // stats timeout (large repositories can be slow)
CountCommitsTimeout = 2 * time.Minute // commit count timeout
)
4.6 重试策略
- 网络错误:最多重试 3 次,指数退避(1s, 2s, 4s)
- 认证错误:不重试,直接失败
- 超时:不重试,直接失败
- 其他错误:重试 1 次
5. 统计实现细节
5.1 Git 命令方案(优先)
统计命令
# 统计所有贡献者的代码变更
git log --no-merges --numstat --pretty=format:"COMMIT:%H|AUTHOR:%an|EMAIL:%ae|DATE:%ai" \
--since="2024-01-01" --until="2024-12-31"
# 输出格式:
COMMIT:abc123|AUTHOR:Alice|EMAIL:alice@example.com|DATE:2024-01-15 10:00:00 +0800
100 50 src/main.go
200 30 src/utils.go
COMMIT:def456|AUTHOR:Bob|EMAIL:bob@example.com|DATE:2024-01-16 11:00:00 +0800
50 10 src/test.go
解析逻辑
对于每个文件变更:
additions: 新增行数
deletions: 删除行数
modifications: min(additions, deletions) # 修改的定义:被替换的行数
net_additions: additions - deletions # 净增加
按作者聚合:
total_additions = sum(additions)
total_deletions = sum(deletions)
total_modifications = sum(modifications)
total_net_additions = total_additions - total_deletions
提交次数统计
# 按日期范围
git rev-list --count --since="2024-01-01" --until="2024-12-31" HEAD
# 按提交数限制
git log --oneline -n 100 | wc -l
5.2 go-git 方案(Fallback)
// Pseudocode: walks the history from HEAD and accumulates added/deleted line
// counts per author e-mail, skipping merge commits.
// Error handling for the three setup calls and for the ForEach return value
// is elided here; real code must check them.
repo, _ := git.PlainOpen(repoPath)
ref, _ := repo.Head()
commits, _ := repo.Log(&git.LogOptions{From: ref.Hash()})
contributors := make(map[string]*ContributorStats)
commits.ForEach(func(c *object.Commit) error {
	if c.NumParents() > 1 {
		return nil // Skip merge commits
	}
	// Commit.Stats diffs against the first parent and falls back to an empty
	// tree for the root commit, which has no parent to diff against.
	stats, err := c.Stats()
	if err != nil {
		return err
	}
	// Create the per-author entry on first sight; writing through a missing
	// map entry would dereference a nil pointer.
	entry, ok := contributors[c.Author.Email]
	if !ok {
		entry = &ContributorStats{}
		contributors[c.Author.Email] = entry
	}
	for _, fileStat := range stats {
		entry.Additions += fileStat.Addition
		entry.Deletions += fileStat.Deletion
	}
	return nil
})
限制说明:
- go-git 的 diff 性能比 git 命令慢(特别是大仓库)
- 作为 fallback 方案,功能等价但性能可能差 10-100 倍
- 建议生产环境保证 git 命令可用
5.3 互斥参数校验
// ValidateStatsConstraint enforces that a stats request carries exactly one
// kind of constraint: either a date range (from + to) or a commit limit.
// It returns nil when the constraint is consistent and a descriptive error
// otherwise.
func ValidateStatsConstraint(req *StatsRequest) error {
	constraint := req.Constraint
	hasFrom := constraint.From != ""
	hasTo := constraint.To != ""

	switch constraint.Type {
	case "date_range":
		// Both bounds are required and a limit must not be mixed in.
		if !hasFrom || !hasTo {
			return errors.New("date_range requires both from and to")
		}
		if constraint.Limit != 0 {
			return errors.New("date_range cannot be used with limit")
		}
		return nil
	case "commit_limit":
		// A positive limit is required and no date bound may be present.
		if constraint.Limit <= 0 {
			return errors.New("commit_limit requires positive limit value")
		}
		if hasFrom || hasTo {
			return errors.New("commit_limit cannot be used with date range")
		}
		return nil
	default:
		return errors.New("constraint type must be date_range or commit_limit")
	}
}
6. 缓存策略
6.1 缓存 Key 设计
// GenerateCacheKey derives the cache key for a statistics result as the
// hex-encoded SHA-256 of the repository ID, branch, constraint and commit
// hash. Any constraint type other than "date_range" is keyed by its limit.
func GenerateCacheKey(repoID int64, branch string, constraint Constraint, commitHash string) string {
	// Default to the commit-limit form; override for date ranges.
	constraintPart := fmt.Sprintf("cl_%d", constraint.Limit)
	if constraint.Type == "date_range" {
		constraintPart = fmt.Sprintf("dr_%s_%s", constraint.From, constraint.To)
	}

	payload := fmt.Sprintf("repo:%d|branch:%s|constraint:%s|commit:%s",
		repoID, branch, constraintPart, commitHash)

	digest := sha256.Sum256([]byte(payload))
	return hex.EncodeToString(digest[:])
}
6.2 缓存失效策略
触发失效的操作:
- 仓库更新(pull): 如果有新提交,则 commit_hash 变化,旧缓存自然失效
- 切换分支(switch): 分支变化,缓存 key 不同
- 重置仓库(reset): 删除该仓库的所有统计缓存
查询时:
// 1. Resolve the commit hash currently at HEAD
currentHash := getHeadCommitHash(repo, branch)
// 2. Build the cache key
cacheKey := GenerateCacheKey(repoID, branch, constraint, currentHash)
// 3. Look up the cache
cache, found := queryCacheByKey(cacheKey)
if found {
// NOTE(review): the hit counters are only updated on the in-memory value here;
// persisting them is not shown — confirm it happens elsewhere.
cache.HitCount++
cache.LastHitAt = time.Now()
return cache.LoadResult()
}
// 4. Cache miss: run the statistics calculation
...
6.3 存储方案
1. 元数据存储: 数据库 (stats_cache 表)
- cache_key, repo_id, branch, constraint, commit_hash
- result_path, result_size, hit_count, created_at, last_hit_at
2. 结果数据存储: 文件系统
- Path: workspace/stats/{cache_key}.json.gz
- Format: gzip 压缩的 JSON
- 清理策略: LRU(最近最少使用),保留最近 30 天或最多 10GB
6.4 大小控制
# Cache size and retention limits.
# NOTE(review): the default configuration in section 9.2 expresses
# max_total_size / max_single_result in bytes and cleanup_interval in seconds —
# confirm which notation the loader accepts.
cache:
  max_total_size: 10GB       # total cache size limit
  max_single_result: 100MB   # size limit for a single result file
  retention_days: 30         # retention in days
  cleanup_interval: 1h       # interval between cleanup checks
7. 安全与凭据
7.1 凭据存储
// CredentialManager encrypts credentials with AES-GCM (AES-256 when the key
// is 32 bytes long).
type CredentialManager struct {
	encryptionKey []byte // read from an environment variable or the config file
}

// EncryptCredential serializes cred to JSON and seals it with AES-GCM.
//
// The returned slice is the random nonce followed by the ciphertext and
// authentication tag, so the nonce can be recovered from the front of the
// blob when decrypting.
//
// It returns an error if the credential cannot be marshalled, the key has an
// invalid size, or the random source fails. The nonce error in particular
// must not be ignored: a failed read would leave an all-zero nonce and reuse
// it for every credential.
func (cm *CredentialManager) EncryptCredential(cred *Credential) ([]byte, error) {
	plaintext, err := json.Marshal(cred)
	if err != nil {
		return nil, err
	}
	block, err := aes.NewCipher(cm.encryptionKey)
	if err != nil {
		return nil, err
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		return nil, err
	}
	nonce := make([]byte, gcm.NonceSize())
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		return nil, err
	}
	// Passing nonce as dst makes Seal append the ciphertext to it.
	return gcm.Seal(nonce, nonce, plaintext, nil), nil
}
7.2 日志脱敏
// userinfoPattern matches the userinfo part ("user:password@" or "token@")
// that directly follows an http(s) scheme. It never crosses a "/", so an "@"
// appearing later in the path is left alone, and it is greedy inside the
// authority so a password that itself contains "@" is masked completely.
// Compiled once instead of on every call.
var userinfoPattern = regexp.MustCompile(`(https?://)[^/\s]+@`)

// SanitizeURL removes the username/password embedded in an http(s) URL and
// replaces it with "***" so the URL can be written to logs. URLs without
// embedded credentials are returned unchanged.
func SanitizeURL(url string) string {
	return userinfoPattern.ReplaceAllString(url, "${1}***@")
}
// Example log call
log.Info("cloning repository",
"repo_id", repoID,
"url", SanitizeURL(repoURL), // https://***@github.com/user/repo.git
)
7.3 Git 凭据注入
Git 命令方案
// Option 1: use a credential helper
// NOTE(review): os.Setenv changes the environment of the whole process;
// presumably a per-command cmd.Env entry is intended when workers run
// concurrently — confirm.
os.Setenv("GIT_ASKPASS", "/path/to/credential-helper.sh")
// Option 2: URL rewriting (for temporary use)

// InjectCredentials returns url with username and password embedded as
// userinfo; special characters in either value are percent-encoded.
// If url cannot be parsed it is returned unchanged rather than dereferencing
// the nil result of the failed parse.
func InjectCredentials(url, username, password string) string {
	u, err := neturl.Parse(url)
	if err != nil {
		return url
	}
	u.User = neturl.UserPassword(username, password)
	return u.String()
}
// When executing the command
// NOTE(review): credentialURL is presumably the result of InjectCredentials; a
// URL with embedded credentials is visible in the process argument list —
// confirm this is acceptable.
cmd := exec.Command("git", "clone", credentialURL, localPath)
cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") // disable interactive prompts
go-git 方案
// Clone with HTTP basic authentication; progress output is written to stdout.
auth := &http.BasicAuth{
Username: username,
Password: password,
}
_, err := git.PlainClone(localPath, false, &git.CloneOptions{
URL: url,
Auth: auth,
Progress: os.Stdout,
})
7.4 命令注入防护
// Never concatenate user input directly into a command line.
// ❌ Wrong: the input is interpreted by the shell
cmd := exec.Command("sh", "-c", "git log "+userInput)
// ✅ Right: pass an argument array (no shell involved) and end option parsing
// first, so input that starts with "-" cannot be interpreted as a git option.
cmd := exec.Command("git", "log", "--end-of-options", userInput)
// Path isolation

// ValidateRepoPath reports an error unless path resolves to the workspace
// directory itself or to a location below it.
//
// The check is done on the relative path from the workspace rather than with
// a plain string prefix test, which would also accept sibling directories
// such as "<workspace>-evil". Symbolic links are not resolved.
func ValidateRepoPath(path string) error {
	abs, err := filepath.Abs(path)
	if err != nil {
		return err
	}
	workspace, err := filepath.Abs(config.WorkspaceDir)
	if err != nil {
		return err
	}
	rel, err := filepath.Rel(workspace, abs)
	if err != nil {
		// No relative path exists (e.g. different volumes): not inside.
		return errors.New("path outside workspace")
	}
	if rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
		return errors.New("path outside workspace")
	}
	return nil
}
8. 可观测性
8.1 结构化日志
// Use zerolog or logrus
// NOTE(review): the chained Int64/Str/Msg form below looks like zerolog's API;
// logrus would presumably need a different call shape — confirm.
log.Info().
Int64("repo_id", repoID).
Str("task_id", taskID).
Str("operation", "clone").
Int64("duration_ms", duration.Milliseconds()).
Str("status", "success").
Msg("repository cloned successfully")
8.2 关键指标
// Prometheus-style metrics.
var (
	// Task metrics
	taskTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "tasks_total",
			Help: "Total number of tasks, partitioned by type and status.",
		},
		[]string{"type", "status"}, // clone/pull/stats, success/failed
	)
	taskDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "task_duration_seconds",
			Help:    "Task duration in seconds, partitioned by type.",
			Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800},
		},
		[]string{"type"},
	)
	// Cache metrics
	cacheHits = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "stats_cache_hits_total",
			Help: "Total number of statistics cache hits.",
		},
	)
	cacheMisses = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "stats_cache_misses_total",
			Help: "Total number of statistics cache misses.",
		},
	)
	// Worker metrics
	workerBusy = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "worker_busy",
			Help: "Number of busy workers, partitioned by worker type.",
		},
		[]string{"type"}, // clone/stats/general
	)
	queueLength = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "task_queue_length",
			Help: "Current length of the task queue.",
		},
	)
)

// Collectors must be registered before the default handler will expose them;
// constructing them with New* alone does not register anything.
func init() {
	prometheus.MustRegister(
		taskTotal,
		taskDuration,
		cacheHits,
		cacheMisses,
		workerBusy,
		queueLength,
	)
}

// Expose the metrics endpoint
http.Handle("/metrics", promhttp.Handler())
8.3 错误分类
const (
ErrCategoryNetwork = "network" // 网络错误
ErrCategoryAuth = "auth" // 认证错误
ErrCategoryNotFound = "not_found" // 仓库/分支不存在
ErrCategoryTimeout = "timeout" // 超时
ErrCategoryInternal = "internal" // 内部错误
ErrCategoryValidation = "validation" // 参数校验错误
)
func ClassifyGitError(err error) string {
errMsg := err.Error()
if strings.Contains(errMsg, "authentication") || strings.Contains(errMsg, "401") {
return ErrCategoryAuth
}
if strings.Contains(errMsg, "not found") || strings.Contains(errMsg, "404") {
return ErrCategoryNotFound
}
if strings.Contains(errMsg, "timeout") || strings.Contains(errMsg, "deadline exceeded") {
return ErrCategoryTimeout
}
if strings.Contains(errMsg, "connection refused") || strings.Contains(errMsg, "network") {
return ErrCategoryNetwork
}
return ErrCategoryInternal
}
9. 假设与默认配置
9.1 部署假设
- 单机部署优先(可扩展到多实例,需引入分布式锁/消息队列)
- 运行环境:Linux (Ubuntu 20.04+)
- Go 版本:1.21+
- Git 版本:2.30+(推荐)
9.2 默认配置
# Default configuration.
server:
  host: 0.0.0.0
  port: 8080
  read_timeout: 30s
  write_timeout: 30s
workspace:
  base_dir: ./workspace
  cache_dir: ./workspace/cache   # repository cache directory
  stats_dir: ./workspace/stats   # statistics result directory
storage:
  type: sqlite                   # sqlite/postgres
  sqlite:
    path: ./workspace/data.db
  postgres:
    host: localhost
    port: 5432
    database: gitcodestatic
    user: postgres
    password: ""
    sslmode: disable
worker:
  clone_workers: 2
  pull_workers: 2
  stats_workers: 2
  general_workers: 4
  queue_buffer: 100              # in-memory queue buffer size
cache:
  max_total_size: 10737418240    # 10GB
  max_single_result: 104857600   # 100MB
  retention_days: 30
  cleanup_interval: 3600         # 1 hour
security:
  encryption_key: ""             # read from the ENCRYPTION_KEY environment variable
git:
  command_path: /usr/bin/git     # path to the git command (looked up on PATH when empty)
  fallback_to_gogit: true        # whether to fall back to go-git
log:
  level: info                    # debug/info/warn/error
  format: json                   # json/text
  output: stdout                 # stdout/file path
metrics:
  enabled: true
  path: /metrics
9.3 资源限制假设
- 仓库规模:单仓库最大 5GB
- 并发请求:50 QPS
- 同时处理的仓库数:10 个
- 单次批量添加仓库数:最多 20 个
附录:运行流程示例
流程1:批量添加仓库
1. POST /api/v1/repos/batch
└─> RepoService.AddRepos()
├─> 校验 URL 格式
├─> 检查是否已存在(去重)
├─> 创建 Repository 记录(status=pending)
├─> 提交 Clone 任务到队列
└─> 返回 task_id 列表
2. Worker 异步处理 Clone 任务
└─> CloneHandler()
├─> 更新任务状态为 running
├─> 更新仓库状态为 cloning
├─> 调用 GitManager.Clone()
│ ├─> 优先使用 git command
│ └─> fallback to go-git(如果配置允许)
├─> 获取当前分支和 HEAD commit hash
├─> 更新仓库状态为 ready
└─> 更新任务状态为 completed
3. GET /api/v1/repos/:id
└─> 查询仓库状态(ready)
流程2:统计代码并缓存
1. POST /api/v1/stats/calculate
└─> StatsService.Calculate()
├─> 校验参数(互斥检查)
├─> 检查仓库状态(必须是 ready)
├─> 提交 Stats 任务到队列
└─> 返回 task_id
2. Worker 异步处理 Stats 任务
└─> StatsHandler()
├─> 更新任务状态为 running
├─> 生成缓存 key(基于 repo/branch/constraint/commit_hash)
├─> 检查缓存是否存在
│ └─> 如果存在,直接返回
├─> 调用 StatsCalculator.Calculate()
│ ├─> 执行 git log --numstat
│ ├─> 解析输出,按作者聚合
│ └─> 计算 additions/deletions/modifications/net
├─> 保存结果到文件(gzip压缩)
├─> 创建 stats_cache 记录
├─> 更新任务状态为 completed
└─> 任务结果中记录 cache_id
3. GET /api/v1/stats/result?...
└─> StatsService.QueryResult()
├─> 生成缓存 key
├─> 查询 stats_cache 表
├─> 如果命中,更新 hit_count 和 last_hit_at
├─> 读取结果文件
└─> 返回(cache_hit=true)
下一步:代码实现
接下来我将生成完整的可运行代码骨架。