# Source code for chana.pos_tagger

#coding=UTF-8
"""
Part-of-Speech (POS) Tagger for shipibo-konibo.
Source model is from the Chana project
"""
import os
from sklearn.externals import joblib
import warnings

# Install a blanket "ignore" filter at import time: no category or module is
# given, so the filter matches every warning, not only those raised here.
warnings.filterwarnings("ignore")


class ShipiboPosTagger:
    """
    Instance of the pre-trained shipibo part-of-speech tagger

    The model is read from ``files/pos_tagger/shipibo_svm_model.pkl``,
    resolved relative to the directory that contains this module.
    """

    # Tag (UD format) -> full tag name in spanish.
    # Tags missing from this table are reported as "Desconocido".
    _COMPLETE_TAGS = {
        "ADJ": "Adjetivo",
        "ADV": "Adverbio",
        "CONJ": "Conjunción",
        "DET": "Determinante",
        "INTJ": "Interjección",
        "NOUN": "Nombre",
        "PROPN": "Nombre Propio",
        "NUM": "Numeral",
        "ONM": "Onomatopeya",
        "INTW": "Palabra Interrogativa",
        "ADP": "Postposición",
        "PRON": "Pronombre",
        "PUNCT": "Puntuación",
        "SYM": "Símbolo",
        "VERB": "Verbo",
        "AUX": "Verbo Auxiliar",
    }

    def __init__(self):
        """
        Constructor of the ShipiboPosTagger class that loads the pretrained
        model
        """
        my_path = os.path.abspath(os.path.dirname(__file__))
        path = os.path.join(my_path, "files/pos_tagger/shipibo_svm_model.pkl")
        self.postagger = joblib.load(path)

    def features(self, sentence, tags, index):
        """
        Method that returns the features of a word in a sentence to be used
        by the model

        :param sentence: the tokens (words) of a sentence in shipibo-konibo,
            as passed by :meth:`pos_tag`. A plain ``str`` is also indexable,
            but then every feature is computed per character, not per word.
        :type sentence: list
        :param tags: tags assigned so far, one slot per token; only the two
            slots before ``index`` are read
        :type tags: list
        :param index: position of the word in the sentence
        :type index: int
        :returns: dict of features for the indexed word
        :rtype: dict
        :raises IndexError: if ``index`` is out of range or the indexed word
            is an empty string

        :Example:

        >>> import chana.pos_tagger
        >>> tagger = chana.pos_tagger.ShipiboPosTagger()
        >>> tagger.features(['Atsa', 'ea', 'piai'], ['', '', ''], 2)
        {'word': 'piai', 'prevWord': 'ea', 'nextWord': '', 'isFirst': False,
         'isLast': True, 'isCapitalized': False, 'isAllCaps': False,
         'isAllLowers': True, 'prefix-1': 'p', 'prefix-2': 'pi',
         'prefix-3': 'pia', 'prefix-4': '', 'suffix-1': 'i',
         'suffix-2': 'ai', 'suffix-3': 'iai', 'suffix-4': '',
         'tag-1': '', 'tag-2': ''}
        """
        word = sentence[index]
        size = len(sentence)
        last = size - 1

        # NOTE(review): the guards on prefix-N / suffix-N compare the number
        # of items in `sentence`, not the length of `word`. That looks
        # unintended, but the pretrained model consumes exactly these
        # features (presumably it was trained with the same extraction --
        # confirm before changing), so the behavior is preserved here.
        return {
            'word': word,
            'prevWord': '' if index == 0 else sentence[index - 1],
            'nextWord': '' if index == last else sentence[index + 1],
            'isFirst': index == 0,
            'isLast': index == last,
            # True whenever upper-casing the first character is a no-op, so
            # digits and punctuation also count as "capitalized".
            'isCapitalized': word[0].upper() == word[0],
            'isAllCaps': word.upper() == word,
            'isAllLowers': word.lower() == word,
            'prefix-1': word[0],
            'prefix-2': '' if size < 2 else word[:2],
            'prefix-3': '' if size < 3 else word[:3],
            'prefix-4': '' if size < 4 else word[:4],
            'suffix-1': word[-1],
            'suffix-2': '' if size < 2 else word[-2:],
            'suffix-3': '' if size < 3 else word[-3:],
            'suffix-4': '' if size < 4 else word[-4:],
            'tag-1': '' if index == 0 else tags[index - 1],
            'tag-2': '' if index < 2 else tags[index - 2],
        }

    def pos_tag(self, sentence):
        """
        Method that predict the pos-tags of a shipibo sentence in the UD
        format

        The sentence is split on whitespace; leading, trailing and repeated
        whitespace is ignored, so an empty or blank sentence yields ``[]``.

        :param sentence: a sentence in shipibo-konibo
        :type sentence: str
        :returns: list of the tags in UD format
        :rtype: list

        :Example:

        >>> import chana.pos_tagger
        >>> tagger = chana.pos_tagger.ShipiboPosTagger()
        >>> tagger.pos_tag('Atsa ea piai')
        ['NOUN', 'PRON', 'VERB']
        """
        # split() without a separator never produces empty tokens, which
        # would otherwise make features() fail on word[0].
        tokens = sentence.split()
        tags = [''] * len(tokens)
        # Tags are filled left to right because the features of each word
        # include the tags already predicted for the two previous words.
        for position in range(len(tokens)):
            word_features = [self.features(tokens, tags, position)]
            tags[position] = self.postagger.predict(word_features)[0]
        return tags

    def full_pos_tag(self, sentence):
        """
        Method that predict the pos-tags of a shipibo sentence and returns
        the full tag in spanish

        :param sentence: a sentence in shipibo-konibo
        :type sentence: str
        :returns: list of the tags in spanish
        :rtype: list

        :Example:

        >>> import chana.pos_tagger
        >>> tagger = chana.pos_tagger.ShipiboPosTagger()
        >>> tagger.full_pos_tag('Atsa ea piai')
        ['Nombre', 'Pronombre', 'Verbo']
        """
        return [self.get_complete_tag(tag) for tag in self.pos_tag(sentence)]

    def get_complete_tag(self, pos):
        """
        Method that returns the full tag in spanish of a tag

        :param pos: a pos tag in the UD format
        :type pos: str
        :returns: str with the tag in spanish, or ``'Desconocido'`` when the
            tag is not a known one
        :rtype: str

        :Example:

        >>> import chana.pos_tagger
        >>> tagger = chana.pos_tagger.ShipiboPosTagger()
        >>> tagger.get_complete_tag('ADJ')
        'Adjetivo'
        """
        return self._COMPLETE_TAGS.get(pos, "Desconocido")