Source code for chana.ner

#coding=UTF-8
"""
Named-entity recognizer for Shipibo-Konibo
The source model comes from the Chana project; it uses predefined rules for the language as well as a CRF from pycrfsuite
"""
import codecs
import collections
import re
import os
import string
import numpy as np
import pycrfsuite


def load_array(file, array):
    """
    Inner function that loads the information of a file into a dict of lists

    Every non-blank line of the file is appended to the list stored under the
    line's first character. Afterwards every value of ``array`` is replaced in
    place by one string with its entries joined by ``|``.

    :param file: path of the file to load, joined onto this module's directory
    :type file: str
    :param array: mapping from a first character to a list of entries;
        it is modified in place
    :type array: dict
    :returns: none
    :rtype: None
    """
    my_path = os.path.abspath(os.path.dirname(__file__))
    path = os.path.join(my_path, file)
    # The context manager closes the file even if reading or decoding fails.
    with codecs.open(path, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()
    for word in lines:
        if not word.strip():
            # A blank line holds no entry and has no usable first character.
            continue
        # An initial without a bucket gets a fresh one instead of a KeyError.
        array.setdefault(word[0], []).append(word)
    for key, elem in array.items():
        array[key] = '|'.join(elem)
def is_number(word):
    """
    Tell whether a Shipibo word is a number word

    The comparison ignores case.

    :param word: a word to be evaluated
    :type word: str
    :returns: 'NUM' when the word is one of the known number words, False otherwise
    :rtype: str

    :Example:

    >>> import chana.ner
    >>> chana.ner.is_number('kimisha')
    'NUM'
    """
    number_words = (
        'westiora', 'rabé', 'kimisha', 'chosko', 'pichika', 'sokota',
        'kanchis', 'posaka', 'iskon', 'chonka', 'pacha', 'waranka',
    )
    lowered = word.lower()
    return 'NUM' if lowered in number_words else False
def is_location(word):
    """
    Function that returns 'LOC' if a Shipibo word is a location or False if not

    Only title-cased words (``str.istitle``) are considered. Such a word is a
    location when it contains one of the character sequences of the pattern
    below, or when an entry of ``files/ner/loc_esp_s.dat`` that starts with the
    same upper-case letter is found inside it.

    :param word: a word to be evaluated
    :type word: str
    :returns: 'LOC' if the word is a location or False if not
    :rtype: str

    :Example:

    >>> import chana.ner
    >>> chana.ner.is_location('Limanko')
    'LOC'
    """
    pattern = re.compile('ain|nko|ainko|mea|meax|nkonia|nkoniax|kea|keax|ainoa|ainoax|oa|oax')
    if not word.istitle():
        return False
    if pattern.search(word):
        return 'LOC'

    letters = string.ascii_uppercase + 'Ñ'
    first_letter = word[0]
    if first_letter not in letters:
        return False

    # The data file is only read once the cheaper checks could not decide.
    # A dict comprehension gives every letter its OWN list;
    # dict.fromkeys(letters, []) would share a single list between all keys.
    locations = {letter: [] for letter in letters}
    load_array('files/ner/loc_esp_s.dat', locations)
    known = locations[first_letter]
    # An empty pattern matches any string, so a letter without entries must
    # be treated as "no match" explicitly.
    if known and re.search(known, word) is not None:
        return 'LOC'
    return False
def is_name(word):
    """
    Function that returns 'PER' if a Shipibo word is a proper name/person or False if not

    The word must start with an upper-case letter (A-Z or Ñ) and contain an
    entry of ``files/ner/per_esp_s.dat`` that starts with that same letter.

    :param word: a word to be evaluated
    :type word: str
    :returns: 'PER' if the word is a proper name/person or False if not
    :rtype: str

    :Example:

    >>> import chana.ner
    >>> chana.ner.is_name('Adriano')
    'PER'
    """
    letters = string.ascii_uppercase + 'Ñ'
    # Words that are empty or do not start with an upper-case letter can
    # never match, so the data file does not need to be read for them.
    if not word or word[0] not in letters:
        return False

    # A dict comprehension gives every letter its OWN list;
    # dict.fromkeys(letters, []) would share a single list between all keys.
    names = {letter: [] for letter in letters}
    load_array('files/ner/per_esp_s.dat', names)
    known = names[word[0]]
    # An empty pattern matches any string, so a letter without entries must
    # be treated as "no match" explicitly.
    if known and re.search(known, word) is not None:
        return 'PER'
    return False
def is_organization(word):
    """
    Function that returns 'ORG' if a Shipibo word is an organization or False if not

    The word must start with an upper-case letter (A-Z or Ñ) and contain an
    entry of ``files/ner/org_esp_s.dat`` that starts with that same letter.
    The word does not have to be title-cased, so acronyms are accepted.

    :param word: a word to be evaluated
    :type word: str
    :returns: 'ORG' if the word is an organization or False if not
    :rtype: str

    :Example:

    >>> import chana.ner
    >>> chana.ner.is_organization('AUT')
    'ORG'
    """
    letters = string.ascii_uppercase + 'Ñ'
    # Words that are empty or do not start with an upper-case letter can
    # never match, so the data file does not need to be read for them.
    if not word or word[0] not in letters:
        return False

    # A dict comprehension gives every letter its OWN list;
    # dict.fromkeys(letters, []) would share a single list between all keys.
    organizations = {letter: [] for letter in letters}
    load_array('files/ner/org_esp_s.dat', organizations)
    known = organizations[word[0]]
    # An empty pattern matches any string, so a letter without entries must
    # be treated as "no match" explicitly.
    if known and re.search(known, word) is not None:
        return 'ORG'
    return False
def is_date(word):
    """
    Function that returns 'FEC' if a word is a month name or False if not

    The comparison ignores case and uses the Spanish month names listed below.

    :param word: a word to be evaluated
    :type word: str
    :returns: 'FEC' if the word is a month name or False if not
    :rtype: str

    :Example:

    >>> import chana.ner
    >>> chana.ner.is_date('Agosto')
    'FEC'
    """
    months = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio',
              'agosto', 'setiembre', 'octubre', 'noviembre', 'diciembre']
    if word.lower() in months:
        return 'FEC'
    # Explicit False (instead of an implicit None) matches the documented
    # contract and the other is_* helpers of this module.
    return False
class ShipiboNER:
    """
    Instance of the rule based NER for Shipibo

    ``rule_tag`` labels the words of a sentence with hand-written rules and
    gazetteer files; ``crf_tag`` feeds those labels, together with simple word
    features, to a pycrfsuite model.
    """

    # Character sequences whose presence marks a title-cased word as a location.
    _LOCATION_PATTERN = re.compile(
        'ain|nko|ainko|mea|meax|nkonia|nkoniax|kea|keax|ainoa|ainoax|oa|oax')
    # Number words recognised by check_numbers (compared in lower case).
    _NUMBERS = frozenset([
        'westiora', 'rabé', 'kimisha', 'chosko', 'pichika', 'sokota',
        'kanchis', 'posaka', 'iskon', 'chonka', 'pacha', 'waranka'])
    # Month names recognised by check_dates (compared in lower case).
    _MONTHS = frozenset([
        'enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio',
        'agosto', 'setiembre', 'octubre', 'noviembre', 'diciembre'])

    def __init__(self):
        """
        Constructor of the class that loads the crf model and the information files
        """
        self.letters = string.ascii_uppercase + 'Ñ'
        # One independent list per letter. dict.fromkeys(letters, []) would
        # make every key point at the same list object.
        self.names = {letter: [] for letter in self.letters}
        self.locations = {letter: [] for letter in self.letters}
        self.organizations = {letter: [] for letter in self.letters}
        self.tagger = pycrfsuite.Tagger()
        my_path = os.path.abspath(os.path.dirname(__file__))
        path = os.path.join(my_path, 'files/ner/crf_ner.crfsuite')
        self.tagger.open(path)
        # load_array replaces every list by a '|'-joined pattern string.
        load_array('files/ner/per_esp_s.dat', self.names)
        load_array('files/ner/loc_esp_s.dat', self.locations)
        load_array('files/ner/org_esp_s.dat', self.organizations)

    @staticmethod
    def _in_gazetteer(word, gazetteer):
        """
        Inner helper that looks a word up in a per-letter gazetteer

        :param word: a word to be evaluated
        :type word: str
        :param gazetteer: mapping from an upper-case initial to a regex pattern string
        :type gazetteer: dict
        :returns: True if the word starts with A-Z or Ñ and the pattern stored
            for that initial is found in the word
        :rtype: bool
        """
        if not word:
            return False
        first_letter = word[0]
        if re.search('[ÑA-Z]', first_letter) is None:
            return False
        alternatives = gazetteer.get(first_letter)
        # An empty pattern would match every word, so a letter without
        # entries counts as "not found".
        if not alternatives:
            return False
        return re.search(alternatives, word) is not None

    def check_locations(self, words, entity_tag):
        """
        Inner method that tags the locations of a sentence with 'LOC'

        Only title-cased words are considered.

        :param words: a list of words to be evaluated
        :type words: list
        :param entity_tag: the list of tags, parallel to ``words``; modified in place
        :type entity_tag: list
        :returns: none
        :rtype: None
        """
        for position, word in enumerate(words):
            if not word.istitle():
                continue
            if (self._LOCATION_PATTERN.search(word)
                    or self._in_gazetteer(word, self.locations)):
                entity_tag[position] = 'LOC'

    def check_names(self, words, entity_tag):
        """
        Inner method that tags the names/persons of a sentence with 'PER'

        :param words: a list of words to be evaluated
        :type words: list
        :param entity_tag: the list of tags, parallel to ``words``; modified in place
        :type entity_tag: list
        :returns: none
        :rtype: None
        """
        for position, word in enumerate(words):
            if self._in_gazetteer(word, self.names):
                entity_tag[position] = 'PER'

    def check_organizations(self, words, entity_tag):
        """
        Inner method that tags the organizations of a sentence with 'ORG'

        :param words: a list of words to be evaluated
        :type words: list
        :param entity_tag: the list of tags, parallel to ``words``; modified in place
        :type entity_tag: list
        :returns: none
        :rtype: None
        """
        for position, word in enumerate(words):
            if self._in_gazetteer(word, self.organizations):
                entity_tag[position] = 'ORG'

    def check_numbers(self, words, entity_tag):
        """
        Inner method that tags the numbers of a sentence with 'NUM'

        :param words: a list of words to be evaluated
        :type words: list
        :param entity_tag: the list of tags, parallel to ``words``; modified in place
        :type entity_tag: list
        :returns: none
        :rtype: None
        """
        for position, word in enumerate(words):
            if word.lower() in self._NUMBERS:
                entity_tag[position] = 'NUM'

    def check_dates(self, words, entity_tag):
        """
        Inner method that tags the dates of a sentence with 'FEC'

        A month name is tagged, and so is a directly neighbouring word made
        only of digits (for example a day or a year).

        :param words: a list of words to be evaluated
        :type words: list
        :param entity_tag: the list of tags, parallel to ``words``; modified in place
        :type entity_tag: list
        :returns: none
        :rtype: None
        """
        last_position = len(words) - 1
        for position, word in enumerate(words):
            if word.lower() not in self._MONTHS:
                continue
            entity_tag[position] = 'FEC'
            if position > 0 and words[position - 1].isdigit():
                entity_tag[position - 1] = 'FEC'
            if position < last_position and words[position + 1].isdigit():
                entity_tag[position + 1] = 'FEC'

    def rule_tag(self, sentence):
        """
        Method that tags a sentence with the rule based system

        The checks run in a fixed order (names, organizations, locations,
        numbers, dates); a later check overwrites the tag of an earlier one.

        :param sentence: a sentence to be evaluated
        :type sentence: str
        :returns: list with the ner tags, one per whitespace-separated word
        :rtype: list

        :Example:

        >>> import chana.ner
        >>> ner = chana.ner.ShipiboNER()
        >>> ner.rule_tag('Limanko enra atsawe')
        ['LOC', 'O', 'O']
        """
        words = sentence.split()
        entity_tag = ['O'] * len(words)
        self.check_names(words, entity_tag)
        self.check_organizations(words, entity_tag)
        self.check_locations(words, entity_tag)
        self.check_numbers(words, entity_tag)
        self.check_dates(words, entity_tag)
        return entity_tag

    def word2features(self, sent, i):
        """
        Inner method that add features to the words of a sentence to be tagged by the crf model

        :param sent: a sentence as a list of (word, rule tag) pairs
        :type sent: list
        :param i: index of the word to be evaluated
        :type i: int
        :returns: list with the features for the indexed word
        :rtype: list
        """
        word = sent[i][0]
        tagBR = sent[i][1]
        features = [
            'bias',
            'word.lower=' + word.lower(),
            'word[-3:]=' + word[-3:],
            'word[-2:]=' + word[-2:],
            'word.isupper=%s' % word.isupper(),
            'word.istitle=%s' % word.istitle(),
            'word.isdigit=%s' % word.isdigit(),
            'tagBR=' + tagBR,
            'tagBR[:2]=' + tagBR[:2],
        ]
        if i > 0:
            # Context features of the previous word.
            word1 = sent[i-1][0]
            tagBR1 = sent[i-1][1]
            features.extend([
                '-1:word.lower=' + word1.lower(),
                '-1:word.istitle=%s' % word1.istitle(),
                '-1:word.isupper=%s' % word1.isupper(),
                '-1:tagBR=' + tagBR1,
                '-1:tagBR[:2]=' + tagBR1[:2],
            ])
        else:
            # Marks the first word of the sentence.
            features.append('BOS')

        if i < len(sent)-1:
            # Context features of the next word.
            word1 = sent[i+1][0]
            tagBR1 = sent[i+1][1]
            features.extend([
                '+1:word.lower=' + word1.lower(),
                '+1:word.istitle=%s' % word1.istitle(),
                '+1:word.isupper=%s' % word1.isupper(),
                '+1:tagBR=' + tagBR1,
                '+1:tagBR[:2]=' + tagBR1[:2],
            ])
        else:
            # Marks the last word of the sentence.
            features.append('EOS')

        return features

    def sent2features(self, sent):
        """
        Inner method that add features to a sentence to be tagged by the crf model

        :param sent: a sentence as a list of (word, rule tag) pairs
        :type sent: list
        :returns: list with one feature list per word
        :rtype: list
        """
        return [self.word2features(sent, i) for i in range(len(sent))]

    def crf_tag(self, sentence):
        """
        Method that tags a sentence with the rule based method and then with the crf model

        :param sentence: a sentence to be evaluated
        :type sentence: str
        :returns: list with the ner tags
        :rtype: list

        :Example:

        >>> import chana.ner
        >>> ner = chana.ner.ShipiboNER()
        >>> ner.crf_tag('Limanko enra atsawe')
        ['LOC', 'O', 'O']
        """
        entity_tag_R = self.rule_tag(sentence)
        # rule_tag splits the sentence the same way, so both lists are parallel.
        vectorWord = list(zip(sentence.split(), entity_tag_R))
        entity_tag = self.tagger.tag(self.sent2features(vectorWord))
        return entity_tag