f512d94c74
git-subtree-dir: clients/ds-app git-subtree-mainline:a24e3e6f22git-subtree-split:5206cf3b0c
162 lines
6.4 KiB
Swift
162 lines
6.4 KiB
Swift
// LocalMLXProvider.swift — S2 구현 (맥미니 메인 로컬 LLM 허브).
|
|
//
|
|
// 맥미니 Gemma 4 26B, llm-router :8890 (OpenAI 호환, wake-on-call). 결정 #4: raw MLX :8801 폐기.
|
|
// - isAvailable = GET /v1/models 경량 probe(짧은 timeout, wake 미트리거 → '가용인데 콜드' 정상)
|
|
// - complete = POST /v1/chat/completions, messages system/user 분리(call-shape 고정)
|
|
//
|
|
// fixture(llm-router-chat.*.json) = CAPTURED_LIVE (2026-06-05, Tailscale 100.76.254.116:8890 실측).
|
|
// 요청 model='mac-mini-default'(별칭) → 응답 model='mlx-community/gemma-4-26b-a4b-it-8bit'.
|
|
import Foundation
|
|
|
|
/// `AIProvider` implementation that talks to the Mac mini llm-router over its
/// OpenAI-compatible HTTP API.
///
/// - `isAvailable` performs a lightweight `GET v1/models` probe with a short timeout.
/// - `complete(_:)` performs `POST v1/chat/completions` with separate system/user messages.
public struct LocalMLXProvider: AIProvider {
    public let id: AIProviderID = .localMLX

    /// Base URL of the Mac mini hub (injected from the S2-Fa config). A base without a
    /// trailing slash; endpoint paths are added with `appendingPathComponent`.
    public let baseURL: URL
    /// Model name sent in the request body.
    let model: String
    /// Session used for both the probe and the completion request.
    let session: URLSession
    /// Timeout (seconds) applied to the completion request.
    let requestTimeout: TimeInterval
    /// Timeout (seconds) applied to the availability probe.
    let probeTimeout: TimeInterval

    /// Creates a provider.
    /// - Parameters:
    ///   - baseURL: Hub base URL; endpoint paths are appended to it.
    ///   - model: llm-router alias (confirmed against live `/v1/models`); resolves to gemma-4-26b.
    ///   - session: URL session to use; defaults to `.shared`.
    ///   - requestTimeout: Timeout for `complete(_:)` in seconds.
    ///   - probeTimeout: Timeout for the `isAvailable` probe in seconds.
    public init(
        baseURL: URL,
        model: String = "mac-mini-default",
        session: URLSession = .shared,
        requestTimeout: TimeInterval = 60,
        probeTimeout: TimeInterval = 2
    ) {
        self.baseURL = baseURL
        self.model = model
        self.session = session
        self.requestTimeout = requestTimeout
        self.probeTimeout = probeTimeout
    }

    // MARK: isAvailable — lightweight health probe (does not trigger wake)

    /// `true` when `GET v1/models` answers with a 2xx status within `probeTimeout`.
    /// Never throws: any transport error (timeout, connection failure) yields `false`.
    public var isAvailable: Bool {
        get async {
            var req = URLRequest(url: baseURL.appendingPathComponent("v1/models"))
            req.httpMethod = "GET"
            req.timeoutInterval = probeTimeout
            do {
                let (_, resp) = try await session.data(for: req)
                guard let http = resp as? HTTPURLResponse else { return false }
                return (200..<300).contains(http.statusCode)
            } catch {
                // Timeout/connection error → false (not a throw). The probe does not trigger
                // wake/model load. "Available" means reachable, not instantly responsive —
                // a cold model's load delay is absorbed by the first complete().
                return false
            }
        }
    }

    // MARK: complete — OpenAI-compatible chat/completions

    /// Sends `request` to `v1/chat/completions` and maps the first choice to an
    /// `AICompletionResponse`.
    ///
    /// - Parameter request: The completion request to encode (see `encodeRequest`).
    /// - Returns: The first choice's text with measured latency in milliseconds.
    /// - Throws: `CancellationError` if the task is cancelled before or during the request;
    ///   `AIProviderError.backendError` on timeout, a non-HTTP response, a non-2xx status,
    ///   or a response without choices. Other transport and decoding errors propagate unchanged.
    public func complete(_ request: AICompletionRequest) async throws -> AICompletionResponse {
        try Task.checkCancellation()

        var req = URLRequest(url: baseURL.appendingPathComponent("v1/chat/completions"))
        req.httpMethod = "POST"
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.timeoutInterval = requestTimeout // S2-Fe: prevent a hung generation from stalling forever
        req.httpBody = try Self.encodeRequest(request, model: model)

        let started = Date()
        let data: Data
        let resp: URLResponse
        do {
            (data, resp) = try await session.data(for: req)
        } catch let error as URLError {
            switch error.code {
            case .timedOut:
                throw AIProviderError.backendError(id, status: -1, reason: "request timed out after \(Int(requestTimeout))s")
            case .cancelled where Task.isCancelled:
                // URLSession's async API reports Task cancellation as URLError(.cancelled),
                // not CancellationError. Normalize it so callers observe the standard
                // structured-concurrency cancellation error (S2-Fe).
                throw CancellationError()
            default:
                throw error
            }
        }

        guard let http = resp as? HTTPURLResponse else {
            throw AIProviderError.backendError(id, status: -1, reason: "non-HTTP response")
        }
        guard (200..<300).contains(http.statusCode) else {
            // Non-2xx → backendError (never silently return empty text).
            // The reason carries at most the first 300 characters of the body.
            let reason = String(data: data, encoding: .utf8).map { String($0.prefix(300)) }
            throw AIProviderError.backendError(id, status: http.statusCode, reason: reason)
        }

        let decoded = try JSONDecoder().decode(OpenAIChatResponse.self, from: data)
        guard let choice = decoded.choices.first else {
            throw AIProviderError.backendError(id, status: http.statusCode, reason: "no choices in response")
        }
        return AICompletionResponse(
            text: choice.message.content,
            providerUsed: .localMLX,
            finishReason: Self.finishReason(choice.finishReason),
            citations: [], // local generation — no corpus citations
            confidence: nil,
            latencyMs: Date().timeIntervalSince(started) * 1000,
            routingNote: nil // the fallback note is filled in by the router
        )
    }

    // MARK: Mapping

    /// Maps an OpenAI `finish_reason` string to `AIFinishReason`.
    /// Every value (including `nil` and unknown strings) currently maps to `.completed`.
    static func finishReason(_ openAI: String?) -> AIFinishReason {
        switch openAI {
        case "stop", "length":
            // "length" = truncated by max_tokens — treated as a normal completion.
            return .completed
        default:
            return .completed
        }
    }

    /// AICompletionRequest → OpenAI chat/completions body. Messages are split into
    /// system/user (single source of truth with the fixture).
    /// system.content = systemPrompt ?? "" (plan S2-2c). temperature is not part of
    /// AICompletionRequest (frozen) → left unset (server default).
    /// - Returns: JSON data encoded with sorted keys.
    /// - Throws: Any error raised by `JSONEncoder.encode`.
    static func encodeRequest(_ request: AICompletionRequest, model: String) throws -> Data {
        let body = OpenAIChatRequest(
            model: model,
            messages: [
                OpenAIChatRequest.Message(role: "system", content: request.systemPrompt ?? ""),
                OpenAIChatRequest.Message(role: "user", content: request.prompt),
            ],
            maxTokens: request.maxTokens,
            stream: false
        )
        let enc = JSONEncoder()
        enc.outputFormatting = [.sortedKeys]
        return try enc.encode(body)
    }
}
|
|
|
|
// MARK: - OpenAI 호환 wire 타입 (내부)
|
|
|
|
/// Request body for the OpenAI-compatible `chat/completions` endpoint.
struct OpenAIChatRequest: Encodable, Sendable {
    /// JSON key names; only `maxTokens` is renamed (to `max_tokens`).
    enum CodingKeys: String, CodingKey {
        case model
        case messages
        case maxTokens = "max_tokens"
        case stream
    }

    /// A single chat message: a role and its text content.
    struct Message: Encodable, Sendable {
        let role: String
        let content: String
    }

    /// Model name sent to the server.
    let model: String
    /// Messages in the order they are sent.
    let messages: [Message]
    /// Optional token cap; the `max_tokens` key is omitted from the JSON when `nil`.
    let maxTokens: Int?
    /// Value of the `stream` flag.
    let stream: Bool
}
|
|
|
|
/// Subset of the OpenAI-compatible `chat/completions` response that this file reads.
struct OpenAIChatResponse: Decodable, Sendable {
    /// Choices in the order they appear in the response.
    let choices: [Choice]

    /// One generated alternative.
    struct Choice: Decodable, Sendable {
        /// JSON key names; only `finishReason` is renamed (from `finish_reason`).
        enum CodingKeys: String, CodingKey {
            case index
            case message
            case finishReason = "finish_reason"
        }

        /// The message carried by a choice; both fields are required when decoding.
        struct Message: Decodable, Sendable {
            let role: String
            let content: String
        }

        /// Position of the choice, if the server reports one.
        let index: Int?
        /// The generated message.
        let message: Message
        /// Raw `finish_reason` string, if present.
        let finishReason: String?
    }
}
|