
import re

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
from Products.ZCTextIndex.StopDict import get_stopdict

# Encoding used by the pipeline elements below to decode any non-unicode
# input into unicode (via ``unicode(value, enc)``) before processing it.
enc = 'utf-8'

class UnicodeWordSplitter:
    """Splitter that breaks text into runs of Unicode word characters.

    Input items that are not already unicode are decoded with the
    module-level ``enc`` encoding before being split.
    """

    __implements__ = ISplitter

    # Plain words: one or more Unicode word characters.
    word = re.compile(r"(?u)\w+")
    # Glob terms: a word that may continue with '*' and '?' wildcards.
    wordGlob = re.compile(r"(?u)\w+[\w*?]*")
    # Markup to blank out: tags (<...>) and character/entity references.
    html = re.compile(r"(?u)<[^<>]*>|&[A-Za-z0-9#]+;")

    def process(self, lst, glob=False, strip_html=False):
        """Split every item of ``lst`` into words and return them flat.

        lst        -- iterable of strings (unicode, or bytes in ``enc``).
        glob       -- when true, keep '*' and '?' wildcards that follow
                      the start of a word (uses ``wordGlob``).
        strip_html -- when true, replace tags and entity references with
                      a space before splitting.

        Returns a single list of unicode words, in input order.
        """
        # The pattern depends only on ``glob``, so pick it once up front.
        if glob:
            pattern = self.wordGlob
        else:
            pattern = self.word

        words = []
        for text in lst:
            if not isinstance(text, unicode):
                text = unicode(text, enc)
            if strip_html:
                # A space (not '') keeps words on both sides of a tag apart.
                text = self.html.sub(' ', text)
            words.extend(pattern.findall(text))
        return words

    def processGlob(self, lst):
        """Split ``lst`` like ``process`` but with glob handling enabled."""
        # Goes through self.process so subclasses' overrides are honoured.
        return self.process(lst, True)

# Register UnicodeWordSplitter with the pipeline element factory under
# 'Word Splitter' / 'Whitespace splitter (Unicode)'.
element_factory.registerFactory(
    'Word Splitter',
    'Whitespace splitter (Unicode)',
    UnicodeWordSplitter)


class UnicodeHTMLWordSplitter(UnicodeWordSplitter):
    """Word splitter that always strips HTML markup before splitting."""

    __implements__ = ISplitter

    def process(self, lst, glob=False):
        """Split ``lst`` into words with HTML stripping forced on.

        lst  -- iterable of strings to split.
        glob -- passed through to ``UnicodeWordSplitter.process``.

        Returns the list of words produced by the base splitter.
        """
        return UnicodeWordSplitter.process(
            self, lst, glob=glob, strip_html=True)

# Register UnicodeHTMLWordSplitter with the pipeline element factory under
# 'Word Splitter' / 'HTML aware splitter (Unicode)'.
element_factory.registerFactory(
    'Word Splitter',
    'HTML aware splitter (Unicode)',
    UnicodeHTMLWordSplitter)


class UnicodeCaseNormalizer:
    """Pipeline element that lowercases every term as unicode."""

    def process(self, lst):
        """Return a new list holding the lowercased form of each term.

        lst -- iterable of terms (unicode, or bytes in ``enc``).

        Terms that are not unicode are decoded with the module-level
        ``enc`` encoding before being lowercased.
        """
        def as_unicode(term):
            # Decode only when needed; unicode terms pass through as-is.
            if isinstance(term, unicode):
                return term
            return unicode(term, enc)

        return [as_unicode(term).lower() for term in lst]

# Register UnicodeCaseNormalizer with the pipeline element factory under
# 'Case Normalizer' / 'Case Normalizer (Unicode)'.
element_factory.registerFactory(
    'Case Normalizer',
    'Case Normalizer (Unicode)',
    UnicodeCaseNormalizer)


class UnicodeStopWordRemover:
    """Pipeline element that drops terms found in the stop-word table.

    The table is built once, when the class is defined, from the keys
    returned by ``get_stopdict()``; keys that are not unicode are decoded
    with the module-level ``enc`` encoding so lookups match unicode terms.
    """

    # Stop words keyed as unicode.  Only key membership is used; every
    # value is None.  The attribute name ``dict`` is kept unchanged (even
    # though it shadows the builtin inside this class body) because it is
    # a public class attribute that other code may read or replace.
    dict = {}
    for k in get_stopdict():
        if not isinstance(k, unicode):
            k = unicode(k, enc)
        dict[k] = None

    def process(self, lst):
        """Return the terms of ``lst`` that are not stop words.

        lst -- iterable of terms (unicode, or bytes in ``enc``).

        Terms that are not unicode are decoded with ``enc`` first; the
        returned list contains unicode terms in their original order.
        """
        result = []
        for w in lst:
            if not isinstance(w, unicode):
                w = unicode(w, enc)
            # Membership via ``in`` rather than the deprecated has_key(),
            # which no longer exists on dicts in Python 3.
            if w not in self.dict:
                result.append(w)
        return result

# Register UnicodeStopWordRemover with the pipeline element factory under
# 'Stop Words' / 'Remove listed stop words only (Unicode)'.
element_factory.registerFactory(
    'Stop Words',
    'Remove listed stop words only (Unicode)',
    UnicodeStopWordRemover)

