Jellyfin(8096), OrbStack(8097) 포트 충돌으로 변경. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
233 lines
7.8 KiB
Python
233 lines
7.8 KiB
Python
"""Text collation support for string comparisons.
|
|
|
|
This module provides collation (text comparison) functionality with optional
|
|
PyICU support for advanced Unicode collation. Falls back to simple binary
|
|
and case-insensitive comparisons when PyICU is not available.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections.abc import Callable
|
|
from enum import Enum
|
|
|
|
# Try to import PyICU for advanced collation support
|
|
try:
|
|
from icu import Collator as ICUCollator
|
|
from icu import Locale as ICULocale
|
|
|
|
HAS_PYICU = True
|
|
except ImportError:
|
|
HAS_PYICU = False
|
|
|
|
|
|
class Collation(str, Enum):
|
|
"""Text comparison collation strategies.
|
|
|
|
For most users, use case_sensitive parameter in add_property_filter()
|
|
instead of working with Collation directly.
|
|
|
|
Examples:
|
|
# Simple API (recommended for most users):
|
|
searcher.add_property_filter("SUMMARY", "meeting", case_sensitive=False)
|
|
|
|
# Advanced API (for power users):
|
|
searcher.add_property_filter("SUMMARY", "Müller",
|
|
collation=Collation.LOCALE,
|
|
locale="de_DE")
|
|
"""
|
|
|
|
SIMPLE = "simple"
|
|
"""Simple Python-based collation (no PyICU required).
|
|
|
|
- case_sensitive=True: Byte-for-byte comparison
|
|
- case_sensitive=False: Python's str.lower() comparison
|
|
"""
|
|
|
|
UNICODE = "unicode"
|
|
"""Unicode Collation Algorithm (UCA) root collation.
|
|
|
|
- case_sensitive=True: ICU TERTIARY strength (distinguishes case)
|
|
- case_sensitive=False: ICU SECONDARY strength (ignores case)
|
|
|
|
Requires PyICU to be installed."""
|
|
|
|
LOCALE = "locale"
|
|
"""Locale-aware collation using CLDR rules.
|
|
|
|
- case_sensitive=True: ICU TERTIARY strength (distinguishes case)
|
|
- case_sensitive=False: ICU SECONDARY strength (ignores case)
|
|
|
|
Requires PyICU to be installed and locale parameter."""
|
|
|
|
|
|
class CollationError(Exception):
|
|
"""Raised when collation operation cannot be performed."""
|
|
|
|
pass
|
|
|
|
|
|
def get_collation_function(
|
|
collation: Collation = Collation.SIMPLE,
|
|
case_sensitive: bool = True,
|
|
locale: str | None = None,
|
|
) -> Callable[[str, str], bool]:
|
|
"""Get a collation function for substring matching.
|
|
|
|
Args:
|
|
collation: The collation strategy to use
|
|
case_sensitive: Whether comparison should be case-sensitive
|
|
locale: Locale string (e.g., "de_DE", "en_US") for LOCALE collation
|
|
|
|
Returns:
|
|
A function that takes (needle, haystack) and returns True if needle
|
|
is found in haystack according to the collation rules.
|
|
|
|
Raises:
|
|
CollationError: If PyICU is required but not available, or if
|
|
invalid parameters are provided.
|
|
|
|
Examples:
|
|
>>> match_fn = get_collation_function(Collation.SIMPLE, case_sensitive=False)
|
|
>>> match_fn("test", "This is a TEST")
|
|
True
|
|
"""
|
|
if collation == Collation.SIMPLE:
|
|
if case_sensitive:
|
|
return _binary_contains
|
|
else:
|
|
return _case_insensitive_contains
|
|
|
|
elif collation in (Collation.UNICODE, Collation.LOCALE):
|
|
if not HAS_PYICU:
|
|
raise CollationError(
|
|
f"Collation '{collation}' requires PyICU to be installed. "
|
|
"Install with: pip install 'icalendar-searcher[collation]'"
|
|
)
|
|
|
|
if collation == Collation.LOCALE:
|
|
if not locale:
|
|
raise CollationError("LOCALE collation requires a locale parameter")
|
|
return _get_icu_contains(locale, case_sensitive)
|
|
else:
|
|
# UNICODE collation uses root locale
|
|
return _get_icu_contains(None, case_sensitive)
|
|
|
|
else:
|
|
raise CollationError(f"Unknown collation: {collation}")
|
|
|
|
|
|
def get_sort_key_function(
|
|
collation: Collation = Collation.SIMPLE,
|
|
case_sensitive: bool = True,
|
|
locale: str | None = None,
|
|
) -> Callable[[str], bytes]:
|
|
"""Get a collation function for generating sort keys.
|
|
|
|
Args:
|
|
collation: The collation strategy to use
|
|
case_sensitive: Whether comparison should be case-sensitive
|
|
locale: Locale string (e.g., "de_DE", "en_US") for LOCALE collation
|
|
|
|
Returns:
|
|
A function that takes a string and returns a sort key (bytes) that
|
|
can be used for sorting according to the collation rules.
|
|
|
|
Raises:
|
|
CollationError: If PyICU is required but not available, or if
|
|
invalid parameters are provided.
|
|
|
|
Examples:
|
|
>>> sort_key_fn = get_sort_key_function(Collation.SIMPLE, case_sensitive=False)
|
|
>>> sorted(["Zebra", "apple", "Banana"], key=sort_key_fn)
|
|
['apple', 'Banana', 'Zebra']
|
|
"""
|
|
if collation == Collation.SIMPLE:
|
|
if case_sensitive:
|
|
return lambda s: s.encode("utf-8")
|
|
else:
|
|
return lambda s: s.lower().encode("utf-8")
|
|
|
|
elif collation in (Collation.UNICODE, Collation.LOCALE):
|
|
if not HAS_PYICU:
|
|
raise CollationError(
|
|
f"Collation '{collation}' requires PyICU to be installed. "
|
|
"Install with: pip install 'icalendar-searcher[collation]'"
|
|
)
|
|
|
|
if collation == Collation.LOCALE:
|
|
if not locale:
|
|
raise CollationError("LOCALE collation requires a locale parameter")
|
|
return _get_icu_sort_key(locale, case_sensitive)
|
|
else:
|
|
# UNICODE collation uses root locale
|
|
return _get_icu_sort_key(None, case_sensitive)
|
|
|
|
else:
|
|
raise CollationError(f"Unknown collation: {collation}")
|
|
|
|
|
|
# ============================================================================
|
|
# Internal implementation functions
|
|
# ============================================================================
|
|
|
|
|
|
def _binary_contains(needle: str, haystack: str) -> bool:
|
|
"""Binary (case-sensitive) substring match."""
|
|
return needle in haystack
|
|
|
|
|
|
def _case_insensitive_contains(needle: str, haystack: str) -> bool:
|
|
"""Case-insensitive substring match."""
|
|
return needle.lower() in haystack.lower()
|
|
|
|
|
|
def _get_icu_contains(locale: str | None, case_sensitive: bool) -> Callable[[str, str], bool]:
|
|
"""Get ICU-based substring matcher.
|
|
|
|
Note: This is a simplified implementation. PyICU doesn't expose ICU's
|
|
StringSearch API which would be needed for proper substring matching with
|
|
collation. For now, we use Python's built-in matching.
|
|
|
|
Future enhancement: Implement proper collation-aware substring matching.
|
|
"""
|
|
|
|
def icu_contains(needle: str, haystack: str) -> bool:
|
|
"""Check if needle is in haystack.
|
|
|
|
This is a fallback implementation until proper ICU StringSearch support
|
|
is added. It provides reasonable behavior for most use cases.
|
|
"""
|
|
# TODO: Use ICU StringSearch for proper collation-aware substring matching
|
|
# For now, fall back to Python's built-in contains
|
|
if case_sensitive:
|
|
return needle in haystack
|
|
else:
|
|
return needle.lower() in haystack.lower()
|
|
|
|
return icu_contains
|
|
|
|
|
|
def _get_icu_sort_key(locale: str | None, case_sensitive: bool) -> Callable[[str], bytes]:
|
|
"""Get ICU-based sort key function.
|
|
|
|
Creates a collator instance and returns a function that generates sort keys.
|
|
The collator strength is configured based on case_sensitive parameter.
|
|
"""
|
|
icu_locale = ICULocale(locale) if locale else ICULocale.getRoot()
|
|
collator = ICUCollator.createInstance(icu_locale)
|
|
|
|
# Set strength based on case sensitivity:
|
|
# PRIMARY = base character differences only
|
|
# SECONDARY = base + accent differences (case-insensitive)
|
|
# TERTIARY = base + accent + case differences (case-sensitive, default)
|
|
if case_sensitive:
|
|
collator.setStrength(ICUCollator.TERTIARY)
|
|
else:
|
|
collator.setStrength(ICUCollator.SECONDARY)
|
|
|
|
def icu_sort_key(s: str) -> bytes:
|
|
"""Generate ICU collation sort key."""
|
|
return collator.getSortKey(s)
|
|
|
|
return icu_sort_key
|