Files
hyungi_document_server/Sources/AI/Providers/LocalMLXProvider.swift
T
hyungi 5383a93f98 feat(ai-fabric): S2 LLM 패브릭 4 provider 결선 + 컴포지션 루트
risk-first 채움(RemoteDS→LocalMLX→OnDevice→Specialized) + makeDefaultRouter 컴포지션 루트.
동결 인터페이스(AIProvider/AIRouter/MockAIProvider) 무변경. SPM AIFabric 단독 빌드·테스트(46 PASS).

- RemoteDS: DSAskClient seam + AskResponse(ask.json) 매핑 + backend exhaustive switch(qwen/cloud TODO)
- LocalMLX: GET /v1/models probe + OpenAI /v1/chat/completions system/user call-shape + non-200 backendError
- OnDevice: FoundationModels 라이브(M5 Max) availability + respond() + GenerationError 9-case 매핑 + stateless/prewarm
- Specialized: scaffold-only(명시 unavailable, vision 폴백 가시화), cloud='claude-cloud' 503
- config 단일소스(env override) + 타임아웃/취소(URLSession 자동 honor, OnDevice 협조적)

실측 동결(S2-3a, M5 Max): availability=available · 취소=COOPERATIVE(~33ms) · 오버플로=exceededContextWindowSize
  · GenerationError 9-case(refusal·concurrentRequests 추가 발견, plan 정정).
한계: LocalMLX fixture=PROVISIONAL_SYNTHETIC(맥미니 offline → 라이브 재캡처 S2-Ff 대기).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 17:20:10 +09:00

162 lines
6.4 KiB
Swift
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// LocalMLXProvider.swift — S2 (LLM fabric).
//
// Backend: Gemma 4 26B behind llm-router on :8890 (OpenAI-compatible, wake-on-call). See #4 regarding raw MLX on :8801.
// - isAvailable = GET /v1/models probe (bounded by the probe timeout; any failure is reported as `false`)
// - complete    = POST /v1/chat/completions, messages = system/user pair (call-shape)
//
// fixture (llm-router-chat.*.json) = PROVISIONAL_SYNTHETIC (backend was offline on 2026-06-04).
// The shape follows the OpenAI format; a live re-capture is pending under S2-2a (S2-Ff).
import Foundation
/// `AIProvider` that talks to an OpenAI-compatible HTTP backend rooted at `baseURL`.
///
/// - `isAvailable` issues `GET v1/models` and reports whether a 2xx answer came back.
/// - `complete(_:)` issues `POST v1/chat/completions` with a system/user message pair and
///   converts the first returned choice into an `AICompletionResponse`.
public struct LocalMLXProvider: AIProvider {
    public let id: AIProviderID = .localMLX

    /// Root URL of the backend. Endpoint paths are attached with `appendingPathComponent`.
    public let baseURL: URL
    /// Model identifier sent in the `model` field of every chat request.
    let model: String
    /// Session used for both the probe and the completion call (injectable for tests).
    let session: URLSession
    /// `timeoutInterval` applied to the completion request, in seconds.
    let requestTimeout: TimeInterval
    /// `timeoutInterval` applied to the availability probe, in seconds.
    let probeTimeout: TimeInterval

    /// Creates a provider.
    /// - Parameters:
    ///   - baseURL: Root URL of the OpenAI-compatible backend.
    ///   - model: Model identifier placed in the request body.
    ///   - session: URL session that performs the HTTP calls.
    ///   - requestTimeout: Timeout for `complete(_:)` requests, in seconds.
    ///   - probeTimeout: Timeout for the `isAvailable` probe, in seconds.
    public init(
        baseURL: URL,
        model: String = "gemma-macmini",
        session: URLSession = .shared,
        requestTimeout: TimeInterval = 60,
        probeTimeout: TimeInterval = 2
    ) {
        self.baseURL = baseURL
        self.model = model
        self.session = session
        self.requestTimeout = requestTimeout
        self.probeTimeout = probeTimeout
    }

    // MARK: - isAvailable — health probe

    /// `true` only when `GET v1/models` returns an HTTP status in 200..<300.
    ///
    /// Never throws: a timeout, transport error, or non-HTTP response is reported as `false`.
    public var isAvailable: Bool {
        get async {
            var req = URLRequest(url: baseURL.appendingPathComponent("v1/models"))
            req.httpMethod = "GET"
            req.timeoutInterval = probeTimeout
            do {
                let (_, resp) = try await session.data(for: req)
                guard let http = resp as? HTTPURLResponse else { return false }
                return (200..<300).contains(http.statusCode)
            } catch {
                // Deliberate best-effort: the probe answers "not available" instead of throwing.
                // Detailed failure reporting is left to complete(_:).
                return false
            }
        }
    }

    // MARK: - complete — OpenAI chat/completions

    /// Sends `request` to `v1/chat/completions` and returns the first choice.
    ///
    /// - Parameter request: Prompt, optional system prompt and token limit to send.
    /// - Returns: The first choice's text, tagged with `.localMLX` and the elapsed time in milliseconds.
    /// - Throws: `CancellationError` when the task is already cancelled on entry;
    ///   `AIProviderError.backendError` for a timeout (status `-1`), a non-HTTP response (status `-1`),
    ///   a non-2xx status, a 2xx body that cannot be decoded, or a response without choices;
    ///   any other error raised by `URLSession` or by request encoding is propagated unchanged.
    public func complete(_ request: AICompletionRequest) async throws -> AICompletionResponse {
        try Task.checkCancellation()

        var req = URLRequest(url: baseURL.appendingPathComponent("v1/chat/completions"))
        req.httpMethod = "POST"
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.timeoutInterval = requestTimeout
        req.httpBody = try Self.encodeRequest(request, model: model)

        let started = Date()
        let data: Data
        let resp: URLResponse
        do {
            // NOTE(review): only `.timedOut` is translated here. Every other URLSession error,
            // including whatever is raised on task cancellation, propagates as-is — confirm that
            // this is the error type the caller expects for an in-flight cancel.
            (data, resp) = try await session.data(for: req)
        } catch let e as URLError where e.code == .timedOut {
            throw AIProviderError.backendError(id, status: -1, reason: "request timed out after \(Int(requestTimeout))s")
        }

        guard let http = resp as? HTTPURLResponse else {
            throw AIProviderError.backendError(id, status: -1, reason: "non-HTTP response")
        }
        guard (200..<300).contains(http.statusCode) else {
            // Surface at most the first 300 characters of the body as the failure reason.
            let reason = String(data: data, encoding: .utf8).map { String($0.prefix(300)) }
            throw AIProviderError.backendError(id, status: http.statusCode, reason: reason)
        }

        // A 2xx answer whose body does not match the expected wire shape is a backend fault too;
        // report it the same way as the other contract violations instead of leaking DecodingError.
        let decoded: OpenAIChatResponse
        do {
            decoded = try JSONDecoder().decode(OpenAIChatResponse.self, from: data)
        } catch let e as DecodingError {
            let detail = String(String(describing: e).prefix(300))
            throw AIProviderError.backendError(id, status: http.statusCode, reason: "undecodable response body: \(detail)")
        }
        guard let choice = decoded.choices.first else {
            throw AIProviderError.backendError(id, status: http.statusCode, reason: "no choices in response")
        }

        return AICompletionResponse(
            text: choice.message.content,
            providerUsed: .localMLX,
            finishReason: Self.finishReason(choice.finishReason),
            citations: [],      // this provider supplies no citations
            confidence: nil,
            latencyMs: Date().timeIntervalSince(started) * 1000,
            routingNote: nil    // this provider attaches no routing note
        )
    }

    // MARK: - Mapping helpers

    /// Maps an OpenAI `finish_reason` string to `AIFinishReason`.
    ///
    /// Every input, including `nil` and unknown values, currently maps to `.completed`.
    /// NOTE(review): `"length"` (the token limit was hit) is also reported as `.completed` —
    /// confirm whether `AIFinishReason` offers a more specific case.
    static func finishReason(_ openAI: String?) -> AIFinishReason {
        switch openAI {
        case "stop": return .completed
        case "length": return .completed   // stopped by max_tokens
        default: return .completed
        }
    }

    /// Builds the JSON body for `v1/chat/completions` from `request`.
    ///
    /// The body always carries a system message followed by a user message; a missing
    /// `systemPrompt` becomes an empty system message. `stream` is always `false`, and keys are
    /// emitted in sorted order so the output is deterministic.
    /// - Parameters:
    ///   - request: Source of the prompt, system prompt and `maxTokens`.
    ///   - model: Model identifier to place in the body.
    /// - Returns: The encoded JSON body.
    /// - Throws: Any error raised by `JSONEncoder`.
    static func encodeRequest(_ request: AICompletionRequest, model: String) throws -> Data {
        let body = OpenAIChatRequest(
            model: model,
            messages: [
                OpenAIChatRequest.Message(role: "system", content: request.systemPrompt ?? ""),
                OpenAIChatRequest.Message(role: "user", content: request.prompt),
            ],
            maxTokens: request.maxTokens,
            stream: false
        )
        let enc = JSONEncoder()
        enc.outputFormatting = [.sortedKeys]
        return try enc.encode(body)
    }
}
// MARK: - OpenAI wire ()
/// Request body for an OpenAI-style `chat/completions` call.
///
/// Encodes to the keys `model`, `messages`, `max_tokens` and `stream`; `max_tokens` is left out
/// of the payload entirely when `maxTokens` is `nil`.
struct OpenAIChatRequest: Encodable, Sendable {
    /// One chat turn: the speaker's role and the text it contributes.
    struct Message: Encodable, Sendable {
        let role: String
        let content: String
    }

    /// JSON key names used on the wire.
    enum CodingKeys: String, CodingKey {
        case model = "model"
        case messages = "messages"
        case maxTokens = "max_tokens"
        case stream = "stream"
    }

    let model: String
    let messages: [Message]
    let maxTokens: Int?
    let stream: Bool

    /// Writes the request field by field, omitting `max_tokens` when no limit is set.
    func encode(to encoder: Encoder) throws {
        var fields = encoder.container(keyedBy: CodingKeys.self)
        try fields.encode(model, forKey: .model)
        try fields.encode(messages, forKey: .messages)
        try fields.encodeIfPresent(maxTokens, forKey: .maxTokens)
        try fields.encode(stream, forKey: .stream)
    }
}
/// Subset of an OpenAI-style `chat/completions` response that the provider reads.
struct OpenAIChatResponse: Decodable, Sendable {
    /// One generated alternative.
    struct Choice: Decodable, Sendable {
        /// The generated message of a choice.
        struct Message: Decodable, Sendable {
            let role: String
            /// Generated text. A `null` or missing `content` on the wire is decoded as `""`.
            let content: String

            enum CodingKeys: String, CodingKey {
                case role, content
            }
        }

        let index: Int?
        let message: Message
        /// Raw `finish_reason` string, if the backend sent one.
        let finishReason: String?

        enum CodingKeys: String, CodingKey {
            case index, message
            case finishReason = "finish_reason"
        }
    }

    let choices: [Choice]
}

// Declared in an extension so the struct keeps its memberwise initializer.
extension OpenAIChatResponse.Choice.Message {
    /// Decodes a message, tolerating a `null` or absent `content`.
    ///
    /// The OpenAI wire format allows `content` to be `null`; treating that as an empty string
    /// keeps an otherwise valid response from failing to decode as a whole.
    init(from decoder: Decoder) throws {
        let fields = try decoder.container(keyedBy: CodingKeys.self)
        role = try fields.decode(String.self, forKey: .role)
        content = try fields.decodeIfPresent(String.self, forKey: .content) ?? ""
    }
}