"""Simple tokenizer wrapper with optional HuggingFace Tokenizers backend.

Provides a lightweight fallback tokenizer when the `tokenizers` package
is not available.
"""

import warnings
import zlib
from typing import List, Optional

try:
	from tokenizers import Tokenizer as HFTokenizer
	HF_AVAILABLE = True
except Exception:
	HF_AVAILABLE = False


class SimpleTokenizer:
	"""Dependency-free whitespace tokenizer used as a fallback.

	Token ids are 16-bit checksums of each whitespace-separated word. They do
	not correspond to any model vocabulary, and ``decode`` cannot recover the
	original words from them.
	"""

	def __init__(self):
		# Separator placed between the rendered ids in ``decode``.
		self.sep = " "

	def encode(self, text: str) -> List[int]:
		"""Split ``text`` on whitespace and map each word to an id.

		Each id is the low 16 bits of the CRC-32 of the word's UTF-8 bytes,
		so it lies in ``[0, 0xFFFF]`` and distinct words may share an id.
		The built-in ``hash()`` is deliberately not used: string hashes are
		salted per process, which would make ids differ between runs.

		Args:
			text: Input string; empty or whitespace-only input yields ``[]``.

		Returns:
			One integer id per whitespace-separated word, in order.
		"""
		# ``surrogatepass`` keeps lone surrogates encodable, so any ``str``
		# can be tokenized without raising UnicodeEncodeError.
		return [
			zlib.crc32(word.encode("utf-8", "surrogatepass")) & 0xFFFF
			for word in text.split()
		]

	def decode(self, tokens: List[int]) -> str:
		"""Render ``tokens`` as decimal strings joined by ``self.sep``.

		The mapping in ``encode`` is one-way, so the result is the list of
		numeric ids as text, not the original words.
		"""
		return self.sep.join(str(token) for token in tokens)


class TokenizerWrapper:
	"""Wrapper that prefers a HuggingFace Tokenizers tokenizer if provided,
	otherwise falls back to a very small whitespace-based tokenizer.

	After construction the instance exposes:
	  * ``tk``     - the underlying tokenizer object,
	  * ``encode`` - callable taking a string and returning a list of ids,
	  * ``decode`` - callable taking a list of ids and returning a string.
	"""

	def __init__(self, tokenizer_path: Optional[str] = None):
		"""Select the tokenizer backend.

		Args:
			tokenizer_path: Path handed to ``HFTokenizer.from_file``. When it
				is ``None`` or empty, or the ``tokenizers`` package could not
				be imported, ``SimpleTokenizer`` is used instead.

		Warns:
			RuntimeWarning: If a path was given but the ``tokenizers``
				package is unavailable, because the fallback ids will not
				match the vocabulary of the requested tokenizer file.
		"""
		if tokenizer_path and HF_AVAILABLE:
			self.tk = HFTokenizer.from_file(tokenizer_path)
			# The backend's encode() result carries the ids in ``.ids``;
			# unwrap it so both backends hand back plain id lists.
			self.encode = lambda t: self.tk.encode(t).ids
			self.decode = lambda ids: self.tk.decode(ids)
		else:
			if tokenizer_path:
				# The caller explicitly asked for a tokenizer file; make the
				# downgrade visible rather than silently producing other ids.
				warnings.warn(
					"tokenizer_path was given but the 'tokenizers' package "
					"is not available; falling back to SimpleTokenizer, "
					"whose ids do not match the tokenizer file's vocabulary.",
					RuntimeWarning,
					stacklevel=2,
				)
			self.tk = SimpleTokenizer()
			self.encode = self.tk.encode
			self.decode = self.tk.decode