from services.ollama_client import ollama_client
from db.vector_store import vector_store
from db.metadata_store import metadata_store
from services.db_client import get_all_issues, get_issue_by_id, get_issues_since


def build_document_text(issue: dict) -> str:
    """Concatenate an issue's text fields into one document for embedding."""
    parts = []
    if issue.get("description"):
        parts.append(issue["description"])
    if issue.get("final_description"):
        parts.append(issue["final_description"])
    if issue.get("detail_notes"):
        parts.append(issue["detail_notes"])
    # The Korean prefixes below ("해결" = solution, "의견" = comment,
    # "원인" = cause) label the fields inside the embedded document.
    if issue.get("solution"):
        parts.append(f"해결: {issue['solution']}")
    if issue.get("management_comment"):
        parts.append(f"의견: {issue['management_comment']}")
    if issue.get("cause_detail"):
        parts.append(f"원인: {issue['cause_detail']}")
    return " ".join(parts)


def build_metadata(issue: dict) -> dict:
    """Build the metadata payload stored alongside each vector."""
    meta = {"issue_id": issue["id"]}
    for key in [
        "category",
        "project_id",
        "review_status",
        "responsible_department",
        "location_info",
    ]:
        val = issue.get(key)
        if val is not None:
            meta[key] = str(val)
    rd = issue.get("report_date")
    if rd:
        meta["report_date"] = str(rd)[:10]  # keep only the YYYY-MM-DD part
    meta["has_solution"] = "true" if issue.get("solution") else "false"
    return meta


BATCH_SIZE = 10


async def _sync_issues_batch(issues: list[dict]) -> tuple[int, int]:
    """Generate embeddings in batches and upsert them into the vector store."""
    synced = 0
    skipped = 0

    # Collect only issues that produce non-empty document text.
    valid = []
    for issue in issues:
        doc_text = build_document_text(issue)
        if not doc_text.strip():
            skipped += 1
            continue
        valid.append((issue, doc_text))

    # Embed and upsert in chunks of BATCH_SIZE.
    for i in range(0, len(valid), BATCH_SIZE):
        batch = valid[i:i + BATCH_SIZE]
        texts = [doc_text for _, doc_text in batch]
        batch_synced = 0
        try:
            embeddings = await ollama_client.batch_embeddings(texts)
            for (issue, doc_text), embedding in zip(batch, embeddings):
                vector_store.upsert(
                    doc_id=f"issue_{issue['id']}",
                    document=doc_text,
                    embedding=embedding,
                    metadata=build_metadata(issue),
                )
                batch_synced += 1
        except Exception:
            # On failure, only the items not yet upserted count as skipped.
            skipped += len(batch) - batch_synced
        synced += batch_synced

    return synced, skipped


async def sync_all_issues() -> dict:
    """Full re-sync: embed every issue and record the highest synced id."""
    issues = get_all_issues()
    synced, skipped = await _sync_issues_batch(issues)
    if issues:
        max_id = max(i["id"] for i in issues)
        metadata_store.set_last_synced_id(max_id)
    return {"synced": synced, "skipped": skipped, "total": len(issues)}


async def sync_single_issue(issue_id: int) -> dict:
    """Embed and upsert a single issue by id."""
    issue = get_issue_by_id(issue_id)
    if not issue:
        return {"status": "not_found"}
    doc_text = build_document_text(issue)
    if not doc_text.strip():
        return {"status": "empty_text"}
    embedding = await ollama_client.generate_embedding(doc_text)
    vector_store.upsert(
        doc_id=f"issue_{issue['id']}",
        document=doc_text,
        embedding=embedding,
        metadata=build_metadata(issue),
    )
    return {"status": "synced", "issue_id": issue_id}


async def sync_incremental() -> dict:
    """Sync only issues created after the last recorded synced id."""
    last_id = metadata_store.get_last_synced_id()
    issues = get_issues_since(last_id)
    synced, skipped = await _sync_issues_batch(issues)
    if issues:
        max_id = max(i["id"] for i in issues)
        metadata_store.set_last_synced_id(max_id)
    return {"synced": synced, "skipped": skipped, "new_issues": len(issues)}


async def search_similar_by_id(issue_id: int, n_results: int = 5) -> list[dict]:
    """Find issues similar to an existing issue, excluding the issue itself."""
    issue = get_issue_by_id(issue_id)
    if not issue:
        return []
    doc_text = build_document_text(issue)
    if not doc_text.strip():
        return []
    embedding = await ollama_client.generate_embedding(doc_text)
    # Request one extra result so the source issue can be dropped.
    results = vector_store.query(
        embedding=embedding,
        n_results=n_results + 1,
    )
    filtered = [r for r in results if r["id"] != f"issue_{issue_id}"]
    return filtered[:n_results]


async def search_similar_by_text(
    query: str, n_results: int = 5, filters: dict | None = None
) -> list[dict]:
    """Semantic search over issues, optionally filtered on metadata fields."""
    embedding = await ollama_client.generate_embedding(query)
    where = None
    if filters:
        conditions = [{k: str(v)} for k, v in filters.items() if v is not None]
        if len(conditions) == 1:
            where = conditions[0]
        elif len(conditions) > 1:
            # Multiple conditions are combined with the store's $and operator.
            where = {"$and": conditions}
    return vector_store.query(
        embedding=embedding,
        n_results=n_results,
        where=where,
    )
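

# --- Usage sketch, not part of the module's public surface ---
# A minimal way to exercise the sync and search flows from the command line,
# assuming the surrounding services (ollama_client, vector_store,
# metadata_store, db_client) are configured. The query string and the
# {"category": "defect"} filter value below are hypothetical examples.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        # Incremental sync picks up only issues newer than the stored id.
        stats = await sync_incremental()
        print(f"synced={stats['synced']} skipped={stats['skipped']}")

        # Free-text search, narrowed to one hypothetical category value.
        hits = await search_similar_by_text(
            "leak in basement wall", n_results=3, filters={"category": "defect"}
        )
        for hit in hits:
            print(hit["id"])

    asyncio.run(_demo())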