#coding=UTF-8
"""
Lemmatizer for shipibo-konibo
The source model comes from the Chana project and uses a KNeighborsClassifier from scikit-learn
"""
import codecs
import os
import numpy as np
from sklearn.externals import joblib
from sklearn import neighbors
import warnings
warnings.filterwarnings("ignore")
def replace_last(source_string, replace_what, replace_with):
    """ Function that replaces the last occurrence of a substring in a word

    If ``replace_what`` does not occur in ``source_string`` the word is
    returned unchanged.

    :param source_string: the source string
    :type source_string: str
    :param replace_what: the substring to be replaced
    :type replace_what: str
    :param replace_with: the string to be inserted
    :type replace_with: str
    :returns: string with the replacement
    :rtype: str
    :raises ValueError: if ``replace_what`` is an empty string (raised by ``str.rpartition``)

    :Example:

    >>> import chana.lemmatizer
    >>> chana.lemmatizer.replace_last('piati','ti','ra')
    'piara'
    """
    head, separator, tail = source_string.rpartition(replace_what)
    if not separator:
        # Nothing matched: rpartition yields ('', '', source_string), so
        # concatenating would wrongly prepend the replacement to the word.
        return source_string
    return head + replace_with + tail
def longest_common_substring(string1, string2):
    """ Function to find the longest common substring of two strings

    When several common substrings share the maximum length, the one that
    ends first in ``string1`` is returned.

    :param string1: string1
    :type string1: str
    :param string2: string2
    :type string2: str
    :returns: longest common substring (empty string if there is none)
    :rtype: str

    :Example:

    >>> import chana.lemmatizer
    >>> chana.lemmatizer.longest_common_substring('limanko','limanra')
    'liman'
    """
    best_length = 0
    best_end = 0
    # previous_row[y] holds the length of the common run ending at
    # string1[x - 2] / string2[y - 1]; only the previous row is ever read,
    # so two rows are enough instead of a full table.
    previous_row = [0] * (len(string2) + 1)
    for x, char1 in enumerate(string1, start=1):
        current_row = [0] * (len(string2) + 1)
        for y, char2 in enumerate(string2, start=1):
            if char1 != char2:
                continue
            run_length = previous_row[y - 1] + 1
            current_row[y] = run_length
            # Strict comparison keeps the first longest run that was found.
            if run_length > best_length:
                best_length = run_length
                best_end = x
        previous_row = current_row
    return string1[best_end - best_length:best_end]
def has_shipibo_suffix(str):
    """ Function that tells whether a word ends with a known shipibo suffix

    The suffixes are read, one per line, from
    ``files/lemmatizer/shipibo_suffixes.dat`` next to this module on every
    call.

    :param str: word to evaluate
    :type str: str
    :returns: True if the word ends with any of the suffixes, False otherwise
    :rtype: bool

    :Example:

    >>> import chana.lemmatizer
    >>> chana.lemmatizer.has_shipibo_suffix('pianra')
    True
    """
    # NOTE: the parameter keeps its original name (which shadows the builtin
    # ``str`` inside this function) so that keyword callers keep working.
    module_dir = os.path.abspath(os.path.dirname(__file__))
    path = os.path.join(module_dir, "files/lemmatizer/shipibo_suffixes.dat")
    # The context manager closes the file even if reading fails.
    with codecs.open(path, "r", "utf-8") as suffix_file:
        suffixes = tuple(suffix_file.read().splitlines())
    # str.endswith accepts a tuple and already returns a bool.
    return str.endswith(suffixes)
def shipibo_suffixes():
    """ Function that returns a list with all the shipibo suffixes

    The suffixes are read, one per line, from
    ``files/lemmatizer/shipibo_suffixes.dat`` next to this module.

    :returns: list with all the suffixes, in file order
    :rtype: list

    :Example:

    >>> import chana.lemmatizer
    >>> chana.lemmatizer.shipibo_suffixes()
    ['naan', 'yama', 'men', 'iosma', ..., 'shoko']
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    path = os.path.join(module_dir, "files/lemmatizer/shipibo_suffixes.dat")
    # The context manager closes the file even if reading fails.
    with codecs.open(path, "r", "utf-8") as suffix_file:
        return suffix_file.read().splitlines()
class ShipiboLemmatizer:
    """
    Instance of the pre-trained shipibo lemmatizer
    """

    def __init__(self):
        """
        Constructor of the class that loads the pretrained model
        """
        my_path = os.path.abspath(os.path.dirname(__file__))
        path = os.path.join(my_path, "files/lemmatizer/shipibo_knn_model.pkl")
        # NOTE(review): joblib.load unpickles the file, so this path must only
        # ever point at the model file shipped with the package.
        self.lemmatizer = joblib.load(path)
        # Number of character features fed to the classifier.
        self.features_length = 18

    def preprocess_word(self, word):
        """ Method that turns a word in an array of features for the classifier

        The features are the code points of the word's letters in reverse
        order (last letter first), padded with zeros up to
        ``features_length``. Words longer than ``features_length`` contribute
        only their last ``features_length`` letters.

        :param word: a word to be transformed
        :type word: str
        :returns: list with the features
        :rtype: list

        :Example:

        >>> import chana.lemmatizer
        >>> lemmatizer = chana.lemmatizer.ShipiboLemmatizer()
        >>> lemmatizer.preprocess_word('shipibobo')
        [111, 98, 111, 98, 105, 112, 105, 104, 115, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        """
        # Truncate after reversing so that long words no longer overflow the
        # fixed-size feature vector.
        features = [ord(letter) for letter in word[::-1][:self.features_length]]
        features.extend([0] * (self.features_length - len(features)))
        return features

    def get_lemma(self, rule, word):
        """ Method that returns the lemma of a shipibo word given a possible rule

        A rule has the form ``'suffix>replacement'``. An empty suffix appends
        the replacement; otherwise the suffix is swapped for the replacement
        only when the word actually ends with it.

        :param rule: a rule to transform a word (only the first element is used)
        :type rule: list
        :param word: a word to be transformed
        :type word: str
        :returns: word transformed
        :rtype: str

        :Example:

        >>> import chana.lemmatizer
        >>> lemmatizer = chana.lemmatizer.ShipiboLemmatizer()
        >>> lemmatizer.get_lemma(['bo>'],'shipibobo')
        'shipibo'
        """
        parts = rule[0].split('>')
        substract, add = parts[0], parts[1]
        if substract == '':
            return word + add
        if word.endswith(substract):
            return replace_last(word, substract, add)
        # The predicted rule does not apply to this word: leave it untouched.
        return word

    def get_rule(self, word):
        """ Method that returns the transformation rule for a shipibo word

        :param word: a word to get the rule
        :type word: str
        :returns: numpy array with the rule
        :rtype: array

        :Example:

        >>> import chana.lemmatizer
        >>> lemmatizer = chana.lemmatizer.ShipiboLemmatizer()
        >>> lemmatizer.get_rule('pikanwe')
        array(['anwe>i'], dtype='<U16')
        """
        # The classifier expects a 2-D array: one row per sample.
        sample = np.array(self.preprocess_word(word)).reshape(1, -1)
        return self.lemmatizer.predict(sample)

    def lemmatize(self, word):
        """ Method that predicts the lemma of a shipibo word

        Words that do not end with a known shipibo suffix are returned
        unchanged without consulting the classifier.

        :param word: a word to get the lemma
        :type word: str
        :returns: lemma of the word
        :rtype: str

        :Example:

        >>> import chana.lemmatizer
        >>> lemmatizer = chana.lemmatizer.ShipiboLemmatizer()
        >>> lemmatizer.lemmatize('pikanwe')
        'piki'
        """
        if not has_shipibo_suffix(word):
            return word
        rule = self.get_rule(word)
        return self.get_lemma(rule, word)
class GeneralLemmatizer:
    """
    Instance of a new lemmatizer to be trained and used
    """

    def __init__(self, features_length = 10, n_neighbors = 5):
        """
        Constructor of the class with the number of features to be used by the lemmatizer

        :param features_length: number of features to be used
        :type features_length: int
        :param n_neighbors: number of neighbors to be used
        :type n_neighbors: int
        """
        self.features_length = features_length
        self.n_neighbors = n_neighbors
        # Stays None until train() has fitted a classifier.
        self.lemmatizer = None

    def train(self, words, lemmas):
        """ Method that trains a new lemmatizer with a list of words and a list of lemmas of the same size

        Each (word, lemma) pair is turned into a ``'left>right'`` rule, where
        ``left`` and ``right`` are what remains of the word and of the lemma
        after removing their longest common substring. A
        KNeighborsClassifier with the hamming metric is then fitted on the
        word features and stored in ``self.lemmatizer``.

        :param words: list of words
        :type words: list
        :param lemmas: list of lemmas
        :type lemmas: list
        :returns: None on success, or an error message (str) when the inputs are rejected
        :rtype: None

        :Example:

        >>> import chana.lemmatizer
        >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
        >>> lemmas = ['perro','gato','mono']
        >>> words = ['perritos','gatitos','monotes']
        >>> lemmatizer.train(words,lemmas)
        """
        # Input problems are reported as returned strings (not exceptions);
        # kept as-is for backward compatibility with existing callers.
        if len(words) != len(lemmas):
            return 'Both arrays must be of the same size'
        if len(words) < self.n_neighbors:
            return 'The number of words to train must be greater than the number of neighbors to predict'
        rules = []
        for word, lemma in zip(words, lemmas):
            common = longest_common_substring(word, lemma)
            rules.append(word.replace(common, "") + ">" + lemma.replace(common, ""))
        # Same feature extraction as at prediction time.
        features = [self.preprocess_word(word) for word in words]
        model = neighbors.KNeighborsClassifier(n_neighbors=self.n_neighbors, metric='hamming')
        model.fit(features, rules)
        self.lemmatizer = model

    def preprocess_word(self,word):
        """ Method that turns a word in an array of features for the classifier according to its features_length

        The features are the code points of the word's letters in reverse
        order (last letter first), padded with zeros up to
        ``features_length``. Words longer than ``features_length`` contribute
        only their last ``features_length`` letters.

        :param word: a word to be transformed
        :type word: str
        :returns: list with the features
        :rtype: list

        :Example:

        >>> import chana.lemmatizer
        >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
        >>> lemmatizer.preprocess_word('perritos')
        [115, 111, 116, 105, 114, 114, 101, 112, 0, 0]
        """
        # Truncate after reversing so that long words no longer overflow the
        # fixed-size feature vector.
        features = [ord(letter) for letter in word[::-1][:self.features_length]]
        features.extend([0] * (self.features_length - len(features)))
        return features

    def get_lemma(self, rule, word):
        """ Method that returns the lemma of a word given a possible rule

        A rule has the form ``'suffix>replacement'``. An empty suffix appends
        the replacement; otherwise the suffix is swapped for the replacement
        only when the word actually ends with it.

        :param rule: a rule to transform a word (only the first element is used)
        :type rule: list
        :param word: a word to be transformed
        :type word: str
        :returns: word transformed
        :rtype: str

        :Example:

        >>> import chana.lemmatizer
        >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
        >>> lemmatizer.get_lemma(['bo>'],'shipibobo')
        'shipibo'
        """
        parts = rule[0].split('>')
        substract, add = parts[0], parts[1]
        if substract == '':
            return word + add
        if word.endswith(substract):
            return replace_last(word, substract, add)
        # The predicted rule does not apply to this word: leave it untouched.
        return word

    def get_rule(self, word):
        """ Method that returns the transformation rule for a word

        The lemmatizer must have been trained before calling this method.

        :param word: a word to get the rule
        :type word: str
        :returns: numpy array with the rule
        :rtype: array

        :Example:

        >>> import chana.lemmatizer
        >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
        >>> lemmatizer.get_rule('perrito')
        array(['ito>0'], dtype='<U16')
        """
        # The classifier expects a 2-D array: one row per sample.
        sample = np.array(self.preprocess_word(word)).reshape(1, -1)
        return self.lemmatizer.predict(sample)

    def lemmatize(self, word):
        """ Method that predicts the lemma of a word with the trained model

        :param word: a word to get the lemma
        :type word: str
        :returns: lemma of the word, or an error message if the model has not been trained
        :rtype: str

        :Example:

        >>> import chana.lemmatizer
        >>> lemmatizer = chana.lemmatizer.GeneralLemmatizer()
        >>> lemmatizer.lemmatize('perrito')
        'perro'
        """
        if self.lemmatizer is None:
            return 'The lemmatizer must be trained first'
        rule = self.get_rule(word)
        return self.get_lemma(rule, word)