5383a93f98
risk-first 채움(RemoteDS→LocalMLX→OnDevice→Specialized) + makeDefaultRouter 컴포지션 루트. 동결 인터페이스(AIProvider/AIRouter/MockAIProvider) 무변경. SPM AIFabric 단독 빌드·테스트(46 PASS). - RemoteDS: DSAskClient seam + AskResponse(ask.json) 매핑 + backend exhaustive switch(qwen/cloud TODO) - LocalMLX: GET /v1/models probe + OpenAI /v1/chat/completions system/user call-shape + non-200 backendError - OnDevice: FoundationModels 라이브(M5 Max) availability + respond() + GenerationError 9-case 매핑 + stateless/prewarm - Specialized: scaffold-only(명시 unavailable, vision 폴백 가시화), cloud='claude-cloud' 503 - config 단일소스(env override) + 타임아웃/취소(URLSession 자동 honor, OnDevice 협조적) 실측 동결(S2-3a, M5 Max): availability=available · 취소=COOPERATIVE(~33ms) · 오버플로=exceededContextWindowSize · GenerationError 9-case(refusal·concurrentRequests 추가 발견, plan 정정). 한계: LocalMLX fixture=PROVISIONAL_SYNTHETIC(맥미니 offline → 라이브 재캡처 S2-Ff 대기). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
162 lines
6.4 KiB
Swift
162 lines
6.4 KiB
Swift
// LocalMLXProvider.swift — S2 구현 (맥미니 메인 로컬 LLM 허브).
|
||
//
|
||
// 맥미니 Gemma 4 26B, llm-router :8890 (OpenAI 호환, wake-on-call). 결정 #4: raw MLX :8801 폐기.
|
||
// - isAvailable = GET /v1/models 경량 probe(짧은 timeout, wake 미트리거 → '가용인데 콜드' 정상)
|
||
// - complete = POST /v1/chat/completions, messages system/user 분리(call-shape 고정)
|
||
//
|
||
// ⚠️ fixture(llm-router-chat.*.json) = PROVISIONAL_SYNTHETIC (맥미니 offline 으로 라이브 캡처 불가, 2026-06-04).
|
||
// OpenAI 표준 스펙 기반. 맥미니 복귀 시 S2-2a 라이브 재캡처로 교체(S2-Ff). 코드 자체는 라이브 동작.
|
||
import Foundation
|
||
|
||
/// `AIProvider` backed by the local LLM hub, spoken to over an OpenAI-compatible HTTP API.
///
/// - `isAvailable` issues a lightweight `GET /v1/models` probe with a short timeout.
/// - `complete(_:)` issues `POST /v1/chat/completions` with separate system and user messages.
public struct LocalMLXProvider: AIProvider {
    public let id: AIProviderID = .localMLX

    /// Hub base URL (injected from config). Expected without a trailing slash;
    /// endpoint paths are attached with `appendingPathComponent`.
    public let baseURL: URL
    /// Model identifier sent in the `model` field of every chat request.
    let model: String
    /// Session used for both the probe and the completion call (injectable for tests).
    let session: URLSession
    /// Per-request timeout, in seconds, applied to `complete(_:)`.
    let requestTimeout: TimeInterval
    /// Per-request timeout, in seconds, applied to the `isAvailable` probe.
    let probeTimeout: TimeInterval

    /// Creates a provider.
    /// - Parameters:
    ///   - baseURL: Hub base URL; `v1/models` and `v1/chat/completions` are appended to it.
    ///   - model: Value of the request's `model` field.
    ///   - session: `URLSession` used for all traffic.
    ///   - requestTimeout: Timeout in seconds for completion requests.
    ///   - probeTimeout: Timeout in seconds for the availability probe.
    public init(
        baseURL: URL,
        model: String = "gemma-macmini",
        session: URLSession = .shared,
        requestTimeout: TimeInterval = 60,
        probeTimeout: TimeInterval = 2
    ) {
        self.baseURL = baseURL
        self.model = model
        self.session = session
        self.requestTimeout = requestTimeout
        self.probeTimeout = probeTimeout
    }

    // MARK: isAvailable — lightweight health probe

    /// `true` when `GET /v1/models` answers with a 2xx status within `probeTimeout`.
    ///
    /// Never throws: any transport error (timeout, connection failure, cancellation) and any
    /// non-HTTP response is reported as `false`.
    public var isAvailable: Bool {
        get async {
            var req = URLRequest(url: baseURL.appendingPathComponent("v1/models"))
            req.httpMethod = "GET"
            req.timeoutInterval = probeTimeout
            do {
                let (_, resp) = try await session.data(for: req)
                guard let http = resp as? HTTPURLResponse else { return false }
                return (200..<300).contains(http.statusCode)
            } catch {
                // Timeout / connection error → false (not a throw). Per the design notes the probe is
                // not meant to trigger a wake or model load: "available" means reachable, not
                // "answers instantly" — a cold model absorbs its load latency in the first complete().
                return false
            }
        }
    }

    // MARK: complete — OpenAI-compatible chat/completions

    /// Sends `request` to `POST /v1/chat/completions` and maps the first choice to an `AICompletionResponse`.
    ///
    /// - Parameter request: Prompt, optional system prompt and optional token cap.
    /// - Returns: The first choice's text, tagged with `.localMLX` and the measured latency in milliseconds.
    /// - Throws:
    ///   - `CancellationError` when the surrounding task is cancelled, before or during the request.
    ///   - `AIProviderError.backendError` on timeout (`status: -1`), non-HTTP response (`status: -1`),
    ///     non-2xx status (with up to 300 characters of the body as the reason), or an empty `choices` array.
    ///   - Any other `URLError`, encoding error or decoding error unchanged.
    public func complete(_ request: AICompletionRequest) async throws -> AICompletionResponse {
        try Task.checkCancellation()

        var req = URLRequest(url: baseURL.appendingPathComponent("v1/chat/completions"))
        req.httpMethod = "POST"
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.timeoutInterval = requestTimeout // keeps a hung generation from stalling forever
        req.httpBody = try Self.encodeRequest(request, model: model)

        let started = Date()
        let data: Data
        let resp: URLResponse
        do {
            (data, resp) = try await session.data(for: req)
        } catch let e as URLError where e.code == .timedOut {
            throw AIProviderError.backendError(id, status: -1, reason: "request timed out after \(Int(requestTimeout))s")
        } catch let e as URLError where e.code == .cancelled && Task.isCancelled {
            // URLSession honors Task cancellation but reports it as URLError(.cancelled).
            // Normalize to CancellationError so callers can detect cancellation uniformly.
            // Guarded by Task.isCancelled so other `.cancelled` causes are passed through as-is.
            throw CancellationError()
        }

        guard let http = resp as? HTTPURLResponse else {
            throw AIProviderError.backendError(id, status: -1, reason: "non-HTTP response")
        }
        guard (200..<300).contains(http.statusCode) else {
            // Non-2xx → backendError; never answer with a silently empty text.
            let reason = String(data: data, encoding: .utf8).map { String($0.prefix(300)) }
            throw AIProviderError.backendError(id, status: http.statusCode, reason: reason)
        }

        let decoded = try JSONDecoder().decode(OpenAIChatResponse.self, from: data)
        guard let choice = decoded.choices.first else {
            throw AIProviderError.backendError(id, status: http.statusCode, reason: "no choices in response")
        }
        return AICompletionResponse(
            text: choice.message.content,
            providerUsed: .localMLX,
            finishReason: Self.finishReason(choice.finishReason),
            citations: [], // local generation — no corpus citations
            confidence: nil,
            latencyMs: Date().timeIntervalSince(started) * 1000,
            routingNote: nil // any fallback note is filled in by the router
        )
    }

    // MARK: Mapping

    /// Maps an OpenAI `finish_reason` string to `AIFinishReason`.
    ///
    /// Every value — including `nil` and unknown strings — currently maps to `.completed`;
    /// the explicit cases document that choice for `"stop"` and `"length"`.
    static func finishReason(_ openAI: String?) -> AIFinishReason {
        switch openAI {
        case "stop": return .completed
        case "length": return .completed // truncated by max_tokens — treated as a normal completion
        default: return .completed
        }
    }

    /// Builds the chat/completions JSON body for `request`.
    ///
    /// Always emits two messages, system then user; the system content is `systemPrompt ?? ""`.
    /// No temperature is set because `AICompletionRequest` carries none, so the server default applies.
    /// Keys are sorted so the output is byte-stable.
    /// - Throws: Any error raised by `JSONEncoder.encode`.
    static func encodeRequest(_ request: AICompletionRequest, model: String) throws -> Data {
        let body = OpenAIChatRequest(
            model: model,
            messages: [
                OpenAIChatRequest.Message(role: "system", content: request.systemPrompt ?? ""),
                OpenAIChatRequest.Message(role: "user", content: request.prompt),
            ],
            maxTokens: request.maxTokens,
            stream: false
        )
        let enc = JSONEncoder()
        enc.outputFormatting = [.sortedKeys]
        return try enc.encode(body)
    }
}
|
||
|
||
// MARK: - OpenAI 호환 wire 타입 (내부)
|
||
|
||
/// Request body for an OpenAI-compatible `POST /v1/chat/completions` call.
///
/// Uses synthesized `Encodable`: `maxTokens` travels under the wire key `max_tokens`
/// and is left out of the payload when it is `nil`.
struct OpenAIChatRequest: Encodable, Sendable {
    /// Model identifier the server should run.
    let model: String
    /// Conversation turns, in the order they are sent.
    let messages: [Message]
    /// Optional cap on generated tokens.
    let maxTokens: Int?
    /// Whether the server should stream the answer.
    let stream: Bool

    /// Wire names; only `maxTokens` differs from its property name.
    enum CodingKeys: String, CodingKey {
        case model
        case messages
        case maxTokens = "max_tokens"
        case stream
    }
}

extension OpenAIChatRequest {
    /// A single chat turn: a role label plus its text.
    struct Message: Encodable, Sendable {
        let role: String
        let content: String
    }
}
|
||
|
||
/// The part of an OpenAI-compatible chat/completions response that this file decodes.
/// Keys not declared here are ignored by the synthesized decoder.
struct OpenAIChatResponse: Decodable, Sendable {
    /// Candidate completions as returned by the server.
    let choices: [Choice]
}

extension OpenAIChatResponse {
    /// One candidate completion.
    struct Choice: Decodable, Sendable {
        /// Position of the choice in the response, when the server reports it.
        let index: Int?
        /// The generated message.
        let message: Message
        /// Raw `finish_reason` string, when the server reports it.
        let finishReason: String?

        /// Wire names; only `finishReason` differs from its property name.
        enum CodingKeys: String, CodingKey {
            case index
            case message
            case finishReason = "finish_reason"
        }
    }
}

extension OpenAIChatResponse.Choice {
    /// Message payload of a choice; both fields are required when decoding.
    struct Message: Decodable, Sendable {
        let role: String
        let content: String
    }
}
|