Files
syn-chat-bot/.venv/lib/python3.9/site-packages/icalendar_searcher/collation.py
Hyungi Ahn c2257d3a86 fix: 포트 충돌 회피 — note_bridge 8098, intent_service 8099
Jellyfin(8096), OrbStack(8097) 포트 충돌으로 변경.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 13:53:55 +09:00

233 lines
7.8 KiB
Python

"""Text collation support for string comparisons.
This module provides collation (text comparison) functionality with optional
PyICU support for advanced Unicode collation. Falls back to simple binary
and case-insensitive comparisons when PyICU is not available.
"""
from __future__ import annotations
from collections.abc import Callable
from enum import Enum
# PyICU is an optional dependency: record whether it could be imported so the
# public functions can raise a helpful error instead of failing on a NameError.
try:
    from icu import Collator as ICUCollator
    from icu import Locale as ICULocale
except ImportError:
    HAS_PYICU = False
else:
    HAS_PYICU = True
class Collation(str, Enum):
    """Strategies available for comparing and ordering text.

    Most callers do not need this enum at all: passing ``case_sensitive`` to
    ``add_property_filter()`` covers the common cases. Use ``Collation``
    only when a specific collation strategy has to be selected.

    Examples:
        # Simple API (recommended for most users):
        searcher.add_property_filter("SUMMARY", "meeting", case_sensitive=False)

        # Advanced API (for power users):
        searcher.add_property_filter("SUMMARY", "Müller",
                                     collation=Collation.LOCALE,
                                     locale="de_DE")
    """

    #: Plain Python collation; PyICU is not required.
    #:
    #: - case_sensitive=True: the strings are compared exactly as given
    #: - case_sensitive=False: both strings go through ``str.lower()`` first
    SIMPLE = "simple"

    #: Unicode Collation Algorithm (UCA) using the ICU root locale.
    #:
    #: - case_sensitive=True: ICU TERTIARY strength (distinguishes case)
    #: - case_sensitive=False: ICU SECONDARY strength (ignores case)
    #:
    #: Requires PyICU to be installed.
    UNICODE = "unicode"

    #: Locale-aware collation following the rules of a specific locale.
    #:
    #: - case_sensitive=True: ICU TERTIARY strength (distinguishes case)
    #: - case_sensitive=False: ICU SECONDARY strength (ignores case)
    #:
    #: Requires PyICU to be installed and a ``locale`` argument.
    LOCALE = "locale"
class CollationError(Exception):
    """Signals that a requested collation cannot be provided.

    Raised by this module when PyICU is needed but missing, when LOCALE
    collation is requested without a locale, or when the collation value is
    not recognised.
    """
def get_collation_function(
    collation: Collation = Collation.SIMPLE,
    case_sensitive: bool = True,
    locale: str | None = None,
) -> Callable[[str, str], bool]:
    """Build a substring-matching predicate for the chosen collation.

    Args:
        collation: Which collation strategy the matcher should follow.
        case_sensitive: ``True`` to treat upper and lower case as different.
        locale: Locale identifier such as "de_DE" or "en_US"; mandatory for
            ``Collation.LOCALE`` and ignored by the other strategies.

    Returns:
        A callable ``match(needle, haystack)`` returning ``True`` when
        ``needle`` occurs in ``haystack`` under the selected rules.

    Raises:
        CollationError: If an ICU-backed collation is requested without
            PyICU installed, if ``Collation.LOCALE`` is requested without a
            locale, or if ``collation`` is not a known strategy.

    Examples:
        >>> match_fn = get_collation_function(Collation.SIMPLE, case_sensitive=False)
        >>> match_fn("test", "This is a TEST")
        True
    """
    # SIMPLE never touches ICU, so resolve it first.
    if collation == Collation.SIMPLE:
        return _binary_contains if case_sensitive else _case_insensitive_contains

    # Anything that is not one of the ICU-backed strategies is unsupported.
    if collation not in (Collation.UNICODE, Collation.LOCALE):
        raise CollationError(f"Unknown collation: {collation}")

    if not HAS_PYICU:
        raise CollationError(
            f"Collation '{collation}' requires PyICU to be installed. "
            "Install with: pip install 'icalendar-searcher[collation]'"
        )

    # UNICODE collation uses the root locale, signalled by passing None.
    if collation != Collation.LOCALE:
        return _get_icu_contains(None, case_sensitive)

    if not locale:
        raise CollationError("LOCALE collation requires a locale parameter")
    return _get_icu_contains(locale, case_sensitive)
def get_sort_key_function(
    collation: Collation = Collation.SIMPLE,
    case_sensitive: bool = True,
    locale: str | None = None,
) -> Callable[[str], bytes]:
    """Build a sort-key generator for the chosen collation.

    Args:
        collation: Which collation strategy the keys should follow.
        case_sensitive: ``True`` to treat upper and lower case as different.
        locale: Locale identifier such as "de_DE" or "en_US"; mandatory for
            ``Collation.LOCALE`` and ignored by the other strategies.

    Returns:
        A callable that maps a string to a ``bytes`` key; sorting by that
        key orders strings according to the selected rules.

    Raises:
        CollationError: If an ICU-backed collation is requested without
            PyICU installed, if ``Collation.LOCALE`` is requested without a
            locale, or if ``collation`` is not a known strategy.

    Examples:
        >>> sort_key_fn = get_sort_key_function(Collation.SIMPLE, case_sensitive=False)
        >>> sorted(["Zebra", "apple", "Banana"], key=sort_key_fn)
        ['apple', 'Banana', 'Zebra']
    """
    # SIMPLE keys are the UTF-8 bytes of the (optionally lower-cased) text.
    if collation == Collation.SIMPLE:
        if case_sensitive:
            return lambda s: s.encode("utf-8")
        return lambda s: s.lower().encode("utf-8")

    # Anything that is not one of the ICU-backed strategies is unsupported.
    if collation not in (Collation.UNICODE, Collation.LOCALE):
        raise CollationError(f"Unknown collation: {collation}")

    if not HAS_PYICU:
        raise CollationError(
            f"Collation '{collation}' requires PyICU to be installed. "
            "Install with: pip install 'icalendar-searcher[collation]'"
        )

    # UNICODE collation uses the root locale, signalled by passing None.
    if collation != Collation.LOCALE:
        return _get_icu_sort_key(None, case_sensitive)

    if not locale:
        raise CollationError("LOCALE collation requires a locale parameter")
    return _get_icu_sort_key(locale, case_sensitive)
# ============================================================================
# Internal implementation functions
# ============================================================================
def _binary_contains(needle: str, haystack: str) -> bool:
    """Return True when ``needle`` occurs verbatim in ``haystack``.

    No normalisation is applied, so the match is case-sensitive.
    """
    is_present = needle in haystack
    return is_present
def _case_insensitive_contains(needle: str, haystack: str) -> bool:
    """Return True when ``needle`` occurs in ``haystack``, ignoring case.

    Both strings are normalised with ``str.lower()`` (not ``casefold()``)
    before the substring test.
    """
    lowered_needle = needle.lower()
    lowered_haystack = haystack.lower()
    return lowered_needle in lowered_haystack
def _get_icu_contains(locale: str | None, case_sensitive: bool) -> Callable[[str, str], bool]:
    """Return the substring matcher used for the ICU-backed collations.

    This is a placeholder implementation: ``locale`` is accepted but not
    used yet, and no ICU collator is consulted. The returned matcher relies
    on Python's built-in substring test, lower-casing both strings first
    when ``case_sensitive`` is false.

    Future enhancement: implement proper collation-aware substring matching.
    """

    def icu_contains(needle: str, haystack: str) -> bool:
        """Report whether ``needle`` occurs in ``haystack``.

        Fallback behaviour until collation-aware searching is implemented.
        """
        # TODO: Use ICU StringSearch for proper collation-aware substring matching
        # For now, fall back to Python's built-in contains
        if not case_sensitive:
            needle, haystack = needle.lower(), haystack.lower()
        return needle in haystack

    return icu_contains
def _get_icu_sort_key(locale: str | None, case_sensitive: bool) -> Callable[[str], bytes]:
    """Return a sort-key function backed by an ICU collator.

    A single collator is created for ``locale`` (or for the ICU root locale
    when ``locale`` is empty or ``None``) and reused by every call of the
    returned function.

    Args:
        locale: Locale identifier passed to ICU, or ``None`` for the root.
        case_sensitive: Selects the collator strength (see below).

    Returns:
        A callable that maps a string to the collator's sort key.
    """
    if locale:
        icu_locale = ICULocale(locale)
    else:
        icu_locale = ICULocale.getRoot()
    collator = ICUCollator.createInstance(icu_locale)

    # Collator strength levels:
    #   PRIMARY   = base character differences only
    #   SECONDARY = base + accent differences (case-insensitive)
    #   TERTIARY  = base + accent + case differences (case-sensitive, default)
    strength = ICUCollator.TERTIARY if case_sensitive else ICUCollator.SECONDARY
    collator.setStrength(strength)

    def icu_sort_key(s: str) -> bytes:
        """Produce the ICU collation sort key for ``s``."""
        return collator.getSortKey(s)

    return icu_sort_key