# Source code for chana.lemmatizer

#coding=UTF-8
"""
Lemmatizer for shipibo-konibo
Source model is from the Chana project and a use KNeighborsClassifier from scikit-learn
"""
import codecs
import os
import numpy as np
from sklearn.externals import joblib
from sklearn import neighbors
import warnings

warnings.filterwarnings("ignore")

def replace_last(source_string, replace_what, replace_with):
    """
    Replace the last occurrence of a substring in a string.

    If ``replace_what`` does not occur in ``source_string``, the
    replacement is prepended to the original string (behavior of
    ``str.rpartition`` on a missing separator).

    :param source_string: the source string
    :type source_string: str
    :param replace_what: the substring to be replaced
    :type replace_what: str
    :param replace_with: the string to be inserted
    :type replace_with: str
    :returns: string with the replacement
    :rtype: str

    :Example:
        >>> import chana.lemmatizer
        >>> chana.lemmatizer.replace_last('piati','ti','ra')
        'piara'
    """
    # rpartition splits on the *last* occurrence, which is exactly the
    # piece we want to swap out.
    prefix, _found, suffix = source_string.rpartition(replace_what)
    return f"{prefix}{replace_with}{suffix}"
def longest_common_substring(string1, string2):
    """
    Find the longest common substring of two strings.

    Uses the classic dynamic-programming table where cell ``(i, j)`` holds
    the length of the common run ending at ``string1[i-1]`` / ``string2[j-1]``.
    On ties, the earliest match in ``string1`` wins.

    :param string1: string1
    :type string1: str
    :param string2: string2
    :type string2: str
    :returns: longest common substring
    :rtype: str

    :Example:
        >>> import chana.lemmatizer
        >>> chana.lemmatizer.longest_common_substring('limanko','limanra')
        'liman'
    """
    table = [[0] * (len(string2) + 1) for _ in range(len(string1) + 1)]
    best_len = 0
    best_end = 0  # index in string1 just past the best run
    for i, ch1 in enumerate(string1, start=1):
        for j, ch2 in enumerate(string2, start=1):
            if ch1 == ch2:
                run = table[i - 1][j - 1] + 1
                table[i][j] = run
                # Strict '>' keeps the first maximal run found.
                if run > best_len:
                    best_len = run
                    best_end = i
    return string1[best_end - best_len:best_end]
def has_shipibo_suffix(word):
    """
    Return whether a word ends with a known Shipibo suffix.

    The suffix inventory is read from the packaged data file
    ``files/lemmatizer/shipibo_suffixes.dat`` (one suffix per line).

    :param word: word to evaluate
    :type word: str
    :returns: True or False
    :rtype: bool

    :Example:
        >>> import chana.lemmatizer
        >>> chana.lemmatizer.has_shipibo_suffix('pianra')
        True
    """
    # Fixes: the parameter used to shadow the builtin ``str`` and the data
    # file was opened without ever being closed.
    my_path = os.path.abspath(os.path.dirname(__file__))
    path = os.path.join(my_path, "files/lemmatizer/shipibo_suffixes.dat")
    with codecs.open(path, "r", "utf-8") as suffix_file:
        suffixes = tuple(suffix_file.read().splitlines())
    # str.endswith accepts a tuple of candidates and returns a bool directly.
    return word.endswith(suffixes)
def shipibo_suffixes():
    """
    Return a list with all the Shipibo suffixes.

    Suffixes are read from the packaged data file
    ``files/lemmatizer/shipibo_suffixes.dat`` (one suffix per line).

    :returns: list with all the suffixes
    :rtype: list

    :Example:
        >>> import chana.lemmatizer
        >>> chana.lemmatizer.shipibo_suffixes()
        ['naan', 'yama', 'men', 'iosma', ..., 'shoko']
    """
    # Fix: the file handle used to leak; a with-block closes it, and the
    # local no longer shadows this function's own name.
    my_path = os.path.abspath(os.path.dirname(__file__))
    path = os.path.join(my_path, "files/lemmatizer/shipibo_suffixes.dat")
    with codecs.open(path, "r", "utf-8") as suffix_file:
        return suffix_file.read().splitlines()
class ShipiboLemmatizer:
    """
    Instance of the pre-trained shipibo lemmatizer.

    Wraps a KNeighborsClassifier that maps a fixed-length character-code
    representation of a word to a ``'suffix>replacement'`` rewrite rule.
    """

    def __init__(self):
        """
        Constructor of the class that loads the pretrained model.
        """
        my_path = os.path.abspath(os.path.dirname(__file__))
        path = os.path.join(my_path, "files/lemmatizer/shipibo_knn_model.pkl")
        self.lemmatizer = joblib.load(path)
        # The packaged model was trained on 18-dimensional feature vectors.
        self.features_length = 18

    def preprocess_word(self, word):
        """
        Turn a word into the fixed-length feature vector for the classifier.

        The features are the Unicode code points of the word's characters,
        read from the end of the word, zero-padded to ``features_length``.
        Words longer than ``features_length`` are truncated (the leading
        characters are dropped); previously such words raised IndexError.

        :param word: a word to be transformed
        :type word: str
        :returns: list with the features
        :rtype: list

        :Example:
            >>> import chana.lemmatizer
            >>> lemmatizer = chana.lemmatizer.ShipiboLemmatizer()
            >>> lemmatizer.preprocess_word('shipibobo')
            [111, 98, 111, 98, 105, 112, 105, 104, 115, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        """
        features = [0] * self.features_length
        for i, letter in enumerate(reversed(word)):
            # Bug fix: bound the index so long words no longer overflow
            # the feature vector.
            if i >= self.features_length:
                break
            features[i] = ord(letter)
        return features

    def get_lemma(self, rule, word):
        """
        Return the lemma of a shipibo word given a possible rule.

        A rule has the form ``'suffix>replacement'``; when the word ends
        with ``suffix`` it is rewritten, when ``suffix`` is empty the
        replacement is appended, otherwise the word is returned unchanged.

        :param rule: a rule to transform a word
        :type rule: list
        :param word: a word to be transformed
        :type word: str
        :returns: word transformed
        :rtype: str

        :Example:
            >>> import chana.lemmatizer
            >>> lemmatizer = chana.lemmatizer.ShipiboLemmatizer()
            >>> lemmatizer.get_lemma(['bo>'],'shipibobo')
            'shipibo'
        """
        parts = rule[0].split('>')
        substract = parts[0]
        add = parts[1]
        if substract == '':
            return word + add
        if word.endswith(substract):
            return replace_last(word, substract, add)
        return word

    def get_rule(self, word):
        """
        Return the transformation rule predicted for a shipibo word.

        :param word: a word to get the rule
        :type word: str
        :returns: numpy array with the rule
        :rtype: array

        :Example:
            >>> import chana.lemmatizer
            >>> lemmatizer = chana.lemmatizer.ShipiboLemmatizer()
            >>> lemmatizer.get_rule('pikanwe')
            array(['anwe>i'], dtype='<U16')
        """
        features = self.preprocess_word(word)
        features = np.array(features).reshape(1, -1)
        return self.lemmatizer.predict(features)

    def lemmatize(self, word):
        """
        Predict the lemma of a shipibo word.

        Words without a known Shipibo suffix are returned unchanged.

        :param word: a word to get the lemma
        :type word: str
        :returns: lemma of the word
        :rtype: str

        :Example:
            >>> import chana.lemmatizer
            >>> lemmatizer = chana.lemmatizer.ShipiboLemmatizer()
            >>> lemmatizer.lemmatize('pikanwe')
            'piki'
        """
        if has_shipibo_suffix(word):
            rule = self.get_rule(word)
            return self.get_lemma(rule, word)
        return word
class GeneralLemmatizer:
    """
    Instance of a new lemmatizer to be trained and used.

    Trains a KNeighborsClassifier that maps a fixed-length character-code
    representation of a word to a ``'suffix>replacement'`` rewrite rule
    derived from (word, lemma) training pairs.
    """

    def __init__(self, features_length=10, n_neighbors=5):
        """
        Constructor of the class with the number of features to be used
        by the lemmatizer.

        :param features_length: number of features to be used
        :type features_length: int
        :param n_neighbors: number of neighbors to be used
        :type n_neighbors: int
        """
        self.features_length = features_length
        self.n_neighbors = n_neighbors
        # Set by train(); None means the lemmatizer is not usable yet.
        self.lemmatizer = None

    def train(self, words, lemmas):
        """
        Train a new lemmatizer with a list of words and a list of lemmas
        of the same size.

        On invalid input an explanatory string is returned instead of
        raising, preserving the original API contract.

        :param words: list of words
        :type words: list
        :param lemmas: list of lemmas
        :type lemmas: list
        :returns: none
        :rtype: None

        :Example:
            >>> import chana.lemmatizer
            >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
            >>> lemmas = ['perro','gato','mono']
            >>> words = ['perritos','gatitos','monotes']
            >>> lemmatizer.train(words,lemmas)
        """
        if len(words) != len(lemmas):
            return 'Both arrays must be of the same size'
        if len(words) < self.n_neighbors:
            return 'The number of words to train must be greater than the number of neighbors to predict'
        rule_classes = []
        feature_rows = [[0 for _ in range(self.features_length)] for _ in range(len(words))]
        row = 0
        for word, lemma in zip(words, lemmas):
            common = longest_common_substring(word, lemma)
            # NOTE(review): str.replace removes *every* occurrence of the
            # common substring, not only one -- confirm this is intended
            # for words whose stem repeats inside the word.
            left = word.replace(common, "")
            right = lemma.replace(common, "")
            rule_classes.append(left + ">" + right)
            # Features: code points of the word read from its end,
            # truncated/padded to features_length.
            for pos, letter in enumerate(reversed(word)):
                if pos < self.features_length:
                    feature_rows[row][pos] = ord(letter)
            row += 1
        # Hamming metric compares feature vectors position by position.
        model = neighbors.KNeighborsClassifier(n_neighbors=self.n_neighbors, metric='hamming')
        model.fit(feature_rows, rule_classes)
        self.lemmatizer = model

    def preprocess_word(self, word):
        """
        Turn a word into the fixed-length feature vector for the classifier
        according to its ``features_length``.

        Words longer than ``features_length`` are truncated (the leading
        characters are dropped); previously such words raised IndexError.
        The example below reflects the default ``features_length`` of 10
        (the original docstring incorrectly showed an 18-element vector).

        :param word: a word to be transformed
        :type word: str
        :returns: list with the features
        :rtype: list

        :Example:
            >>> import chana.lemmatizer
            >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
            >>> lemmatizer.preprocess_word('perritos')
            [115, 111, 116, 105, 114, 114, 101, 112, 0, 0]
        """
        features = [0] * self.features_length
        for i, letter in enumerate(reversed(word)):
            # Bug fix: bound the index so long words no longer overflow
            # the feature vector.
            if i >= self.features_length:
                break
            features[i] = ord(letter)
        return features

    def get_lemma(self, rule, word):
        """
        Return the lemma of a word given a possible rule.

        A rule has the form ``'suffix>replacement'``; when the word ends
        with ``suffix`` it is rewritten, when ``suffix`` is empty the
        replacement is appended, otherwise the word is returned unchanged.

        :param rule: a rule to transform a word
        :type rule: list
        :param word: a word to be transformed
        :type word: str
        :returns: word transformed
        :rtype: str

        :Example:
            >>> import chana.lemmatizer
            >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
            >>> lemmatizer.get_lemma(['bo>'],'shipibobo')
            'shipibo'
        """
        parts = rule[0].split('>')
        substract = parts[0]
        add = parts[1]
        if substract == '':
            return word + add
        if word.endswith(substract):
            return replace_last(word, substract, add)
        return word

    def get_rule(self, word):
        """
        Return the transformation rule predicted for a word.

        :param word: a word to get the rule
        :type word: str
        :returns: numpy array with the rule
        :rtype: array

        :Example:
            >>> import chana.lemmatizer
            >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
            >>> lemmatizer.get_rule('perrito')
            array(['ito>0'], dtype='<U16')
        """
        features = self.preprocess_word(word)
        features = np.array(features).reshape(1, -1)
        return self.lemmatizer.predict(features)

    def lemmatize(self, word):
        """
        Predict the lemma of a word with the trained model.

        Returns an explanatory string when the model has not been trained
        yet (original API contract preserved).

        :param word: a word to get the lemma
        :type word: str
        :returns: lemma of the word
        :rtype: str

        :Example:
            >>> import chana.lemmatizer
            >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
            >>> lemmatizer.lemmatize('perrito')
            'perro'
        """
        # 'is None' instead of '== None' (identity check for the sentinel).
        if self.lemmatizer is None:
            return 'The lemmatizer must be trained first'
        rule = self.get_rule(word)
        return self.get_lemma(rule, word)