Stanford NER en NLTK no etiqueta correctamente varias oraciones – Python

Tengo una función que devuelve las entidades nombradas en un cuerpo de texto dado, utilizando la NER de Stanford.

def get_named_entities(text):
    """Tag whitespace-separated tokens of *text* with the Stanford NER tagger.

    Returns a 2-tuple: the label string "named entities = " followed by
    whatever ``st.tag`` returns for the token list.

    NOTE(review): the whole text is handed to a single ``st.tag`` call; the
    observed output covers only the first sentence.
    """
    load_ner_files()
    print(text[:100])  # to show that the text is fine
    tokens = text.split()
    print(tokens)  # to show the split is working fine
    return "named entities = ", st.tag(tokens)

Estoy cargando el texto desde una URL usando el paquete newspaper de Python.

 def get_page_text():
     """Download the Aeon essay and return its text reduced to ASCII.

     ``Article`` is presumably the class from the ``newspaper`` package --
     verify against the file's imports.
     """
     essay_url = "https://aeon.co/essays/elon-musk-puts-his-case-for-a-multi-planet-civilisation"
     article = Article(essay_url)
     article.download()
     article.parse()
     # Normalize to NFKD first, then drop every character that has no ASCII
     # encoding ('ignore' discards them instead of raising).
     decomposed = unicodedata.normalize('NFKD', article.text)
     return decomposed.encode('ascii', 'ignore')

Sin embargo, cuando ejecuto la función obtengo el siguiente resultado:

 ['Fuck', 'Earth!', 'Elon', 'Musk', 'said', 'to', 'me,', 'laughing.', 'Who', 'cares', 'about', 'Earth?'......... (continued) named entities = [('Fuck', 'O'), ('Earth', 'O'), ('!', 'O')] 

Así que mi pregunta es, ¿por qué solo se etiquetan las tres primeras palabras?

Suponiendo que se haya configurado NLTK v3.2 correctamente, vea el resumen a continuación.

TL;DR:

 pip install -U nltk 

o

 conda update nltk 

Después de configurar NLTK y Stanford Tools (recuerde configurar las variables de entorno):

 import time
 import urllib.request
 from itertools import chain

 from bs4 import BeautifulSoup
 from nltk import word_tokenize, sent_tokenize
 from nltk.tag import StanfordNERTagger


 class Article:
     """Fetch a web page and split its <p> text into paragraphs, sentences and words.

     Attributes set during construction:
         url, encoding: the constructor arguments.
         data: the decoded HTML of the page.
         bsoup: the BeautifulSoup tree built from ``data``.
         text: the text of every <p> element, joined with newlines.
         paragraphs: one list of sentence strings per non-empty paragraph.
         sents: one list of word tokens per sentence.
         words: all word tokens in document order.
     """

     def __init__(self, url, encoding='utf8'):
         self.url = url
         # Honour the caller's encoding; it was previously hard-coded to 'utf8'.
         self.encoding = encoding
         self.text = self.fetch_url_text()
         self.process_text()

     def fetch_url_text(self):
         """Download ``self.url`` and return the newline-joined text of its <p> tags."""
         # The context manager closes the HTTP response once the body is read.
         with urllib.request.urlopen(self.url) as response:
             self.data = response.read().decode(self.encoding)
         self.bsoup = BeautifulSoup(self.data, 'html.parser')
         return '\n'.join(paragraph.text for paragraph in self.bsoup.find_all('p'))

     def process_text(self):
         """Populate ``paragraphs``, ``sents`` and ``words`` from ``self.text``."""
         self.paragraphs = [sent_tokenize(p.strip())
                            for p in self.text.split('\n') if p]
         # Flatten paragraphs -> sentences, then tokenize each sentence into words.
         _sents = list(chain.from_iterable(self.paragraphs))
         self.sents = [word_tokenize(sent) for sent in _sents]
         self.words = list(chain.from_iterable(self.sents))


 url = 'https://aeon.co/essays/elon-musk-puts-his-case-for-a-multi-planet-civilisation'
 a1 = Article(url)
 three_sentences = a1.sents[20:23]

 # NOTE(review): the model is given by file name only; presumably it is located
 # through the Stanford environment variables -- confirm your setup.
 st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

 # Tag multiple sentences in one go.
 start = time.time()
 tagged_sents = st.tag_sents(three_sentences)
 print("Tagging took:", time.time() - start)
 print(tagged_sents, end="\n\n")
 for sent in tagged_sents:
     print(sent)
 print()

 # (Much slower) Tagging sentences one at a time:
 # Stanford NER is fired up again for every sentence.
 start = time.time()
 tagged_sents = [st.tag(sent) for sent in three_sentences]
 print("Tagging took:", time.time() - start)
 for sent in tagged_sents:
     print(sent)
 print()

[salida]:

 Tagging took: 2.537247657775879 [[('Musk', 'PERSON'), ('was', 'O'), ('laughing', 'O'), ('because', 'O'), ('he', 'O'), ('was', 'O'), ('joking', 'O'), (':', 'O'), ('he', 'O'), ('cares', 'O'), ('a', 'O'), ('great', 'O'), ('deal', 'O'), ('about', 'O'), ('Earth', 'LOCATION'), ('.', 'O')], [('When', 'O'), ('he', 'O'), ('is', 'O'), ('not', 'O'), ('here', 'O'), ('at', 'O'), ('SpaceX', 'ORGANIZATION'), (',', 'O'), ('he', 'O'), ('is', 'O'), ('running', 'O'), ('an', 'O'), ('electric', 'O'), ('car', 'O'), ('company', 'O'), ('.', 'O')], [('But', 'O'), ('this', 'O'), ('is', 'O'), ('his', 'O'), ('manner', 'O'), ('.', 'O')]] [('Musk', 'PERSON'), ('was', 'O'), ('laughing', 'O'), ('because', 'O'), ('he', 'O'), ('was', 'O'), ('joking', 'O'), (':', 'O'), ('he', 'O'), ('cares', 'O'), ('a', 'O'), ('great', 'O'), ('deal', 'O'), ('about', 'O'), ('Earth', 'LOCATION'), ('.', 'O')] [('When', 'O'), ('he', 'O'), ('is', 'O'), ('not', 'O'), ('here', 'O'), ('at', 'O'), ('SpaceX', 'ORGANIZATION'), (',', 'O'), ('he', 'O'), ('is', 'O'), ('running', 'O'), ('an', 'O'), ('electric', 'O'), ('car', 'O'), ('company', 'O'), ('.', 'O')] [('But', 'O'), ('this', 'O'), ('is', 'O'), ('his', 'O'), ('manner', 'O'), ('.', 'O')] Tagging took: 7.375355243682861 [('Musk', 'PERSON'), ('was', 'O'), ('laughing', 'O'), ('because', 'O'), ('he', 'O'), ('was', 'O'), ('joking', 'O'), (':', 'O'), ('he', 'O'), ('cares', 'O'), ('a', 'O'), ('great', 'O'), ('deal', 'O'), ('about', 'O'), ('Earth', 'LOCATION'), ('.', 'O')] [('When', 'O'), ('he', 'O'), ('is', 'O'), ('not', 'O'), ('here', 'O'), ('at', 'O'), ('SpaceX', 'ORGANIZATION'), (',', 'O'), ('he', 'O'), ('is', 'O'), ('running', 'O'), ('an', 'O'), ('electric', 'O'), ('car', 'O'), ('company', 'O'), ('.', 'O')] [('But', 'O'), ('this', 'O'), ('is', 'O'), ('his', 'O'), ('manner', 'O'), ('.', 'O')] 

Resulta que la NER solo funciona en una sola oración a la vez, por lo que solo se etiquetará hasta el final de lo que percibe como la primera oración.