Source code for condor.normalize

'''
This module contains utilities to normalize and simplify text: they can
remove punctuation, filter out stopwords and transform LaTeX accent
escapes into unicode accent characters.
'''

import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


class SpaceTokenizer(object):
    ''' Simple tokenization based on spaces '''

    def tokenize(self, text):
        return text.split()


class Normalizer(object):
    '''
    Base normalizer: applies no transformation by itself, but lets
    subclasses chain their transformations through `super().apply_to()`.
    '''

    default_tokenizer = SpaceTokenizer
    default_language = 'spanish'

    def __init__(self, language=None, tokenizer=None):
        if language is not None:
            self.language = language.lower()
        else:
            self.language = self.default_language
        self.tokenizer = tokenizer or self.default_tokenizer()

    def apply_to(self, text):
        return text
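
# Illustrative note (not part of the original module): the base class leaves
# text untouched, which is what makes the cooperative mixins below work.
#
#     >>> Normalizer().apply_to('Hola Mundo')
#     'Hola Mundo'
#     >>> Normalizer(language='English').language
#     'english'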


class PunctuationRemover(Normalizer):
    ''' Removes punctuation from a text '''

    characters = string.punctuation + '¡¿“”‘’—\''

    def __init__(self, characters=None, **kwargs):
        self.translation = str.maketrans(
            dict.fromkeys(characters or self.characters)
        )
        super().__init__(**kwargs)

    def apply_to(self, text):
        return super().apply_to(text.translate(self.translation))
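
# Illustrative example (not part of the original module): ASCII punctuation
# and the extra Spanish/typographic characters above are all stripped.
#
#     >>> PunctuationRemover().apply_to('¡Hola, mundo!')
#     'Hola mundo'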


class Stemmer(Normalizer):
    ''' Changes words to their respective stems '''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.stemmer = SnowballStemmer(self.language)

    def apply_to(self, text):
        tokens = self.tokenizer.tokenize(text)
        result = ' '.join(self.stemmer.stem(token) for token in tokens)
        return super().apply_to(result)
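
# Illustrative example (not part of the original module): stems come from
# NLTK's Snowball rules for the configured language (Spanish by default),
# so the exact output depends on the installed NLTK version.
#
#     >>> Stemmer().apply_to('caminando caminaremos')
#     'camin camin'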


class StopwordRemover(Normalizer):
    ''' Removes stopwords from a text '''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Only membership tests are needed, so a plain set suffices
        self.stopwords = set(stopwords.words(fileids=self.language))

    def apply_to(self, text):
        tokens = self.tokenizer.tokenize(text)
        result = ' '.join(
            token for token in tokens if token not in self.stopwords
        )
        return super().apply_to(result)
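
# Illustrative example (not part of the original module): the stopword
# corpus has to be fetched once with nltk.download('stopwords').
#
#     >>> StopwordRemover().apply_to('el gato y el perro')
#     'gato perro'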


class Lowercaser(Normalizer):
    ''' Changes case to lowercase through the normalizer API '''

    def apply_to(self, text):
        return super().apply_to(text.lower())


class LatexAccentRemover(Normalizer):
    '''
    Replaces LaTeX accent escapes like `\\'{a}` with unicode accent
    characters like `á`.
    '''

    accents = {
        '\'': {'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú', 'c': 'ć'},
        '`': {'a': 'à', 'e': 'è', 'i': 'ì', 'o': 'ò', 'u': 'ù'},
        '~': {'n': 'ñ', 'o': 'õ', 'a': 'ã'},
        '^': {'a': 'â', 'e': 'ê', 'i': 'î', 'o': 'ô', 'u': 'û'},
        '"': {'a': 'ä', 'e': 'ë', 'i': 'ï', 'o': 'ö', 'u': 'ü', 'y': 'ÿ'},
        'a': {'e': 'æ'},
        'c': {'c': 'ç'},
        'o': {'e': 'œ'},
        's': {'s': 'ß'},
        'v': {'s': 'š'},
    }

    formats = [
        r"{{\{accent}{{{character}}}}}",
        r"{{\{accent}{character}}}",
        r"\{accent}{{{character}}}",
        r"\{accent}{character}",
    ]

    def _replacements(self):
        for accent, cases in self.accents.items():
            for character, modification in cases.items():
                for format_ in self.formats:
                    latex = format_.format(accent=accent, character=character)
                    yield latex, modification
                    yield latex.upper(), modification.upper()

    def apply_to(self, text):
        result = text
        for old, new in self._replacements():
            result = result.replace(old, new)
        return super().apply_to(result)
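
# Illustrative example (not part of the original module): any of the four
# escape formats above is replaced, in both lower and upper case.
#
#     >>> LatexAccentRemover().apply_to(r"Garc\'{i}a \~nandu")
#     'García ñandu'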


class CompleteNormalizer(LatexAccentRemover, PunctuationRemover, Lowercaser,
                         StopwordRemover, Stemmer):
    '''
    A Normalizer that aggregates all the effects described in this module
    '''
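
# Illustrative example (not part of the original module): Python's MRO runs
# the mixins in the order they are listed, so `apply_to` resolves LaTeX
# accents first, then strips punctuation, lowercases, drops stopwords and
# finally stems. Exact stems depend on NLTK's Snowball data.
#
#     >>> normalizer = CompleteNormalizer()
#     >>> normalizer.apply_to(r"¡El Ni\~no est\'a corriendo!")
#     'niñ corr'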