Custom FeatureUnion won't work?

I'm trying to modify this example to use a Pandas dataframe instead of the test datasets. I can't get it to work, because ItemSelector doesn't seem to recognize the column name.

Note that df_resolved.columns returns:

    Index(['u_category', ... ..., 'resolution_time', 'rawtext'], dtype='object')

So the column is obviously there in my dataframe.

However, when I run the solution, I get this error:

    ValueError: no field of name u_category

Also, I can't seem to modify the code so that ItemSelector selects several columns at once, so with this approach I would have to apply the transformers separately, one column at a time.

My code is:

    import numpy as np

    from sklearn import preprocessing
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
    from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import FeatureUnion
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC


    class ItemSelector(BaseEstimator, TransformerMixin):
        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[self.key]


    class TextStats(BaseEstimator, TransformerMixin):
        """Extract features from each document for DictVectorizer"""

        def fit(self, x, y=None):
            return self

        def transform(self, posts):
            return [{'length': len(text),
                     'num_sentences': text.count('.')}
                    for text in posts]


    class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
        """Extract the subject & body from a usenet post in a single pass.

        Takes a sequence of strings and produces a dict of sequences.
        Keys are `subject` and `body`.
        """

        def fit(self, x, y=None):
            return self

        def transform(self, posts):
            features = np.recarray(shape=(len(posts),),
                                   dtype=[('subject', object), ('body', object)])
            for i, text in enumerate(posts):
                headers, _, bod = text.partition('\n\n')
                bod = strip_newsgroup_footer(bod)
                bod = strip_newsgroup_quoting(bod)
                features['body'][i] = bod

                prefix = 'Subject:'
                sub = ''
                for line in headers.split('\n'):
                    if line.startswith(prefix):
                        sub = line[len(prefix):]
                        break
                features['subject'][i] = sub

            return features


    pipeline = Pipeline([
        # Extract the subject & body
        ('subjectbody', SubjectBodyExtractor()),

        # Use FeatureUnion to combine the features from subject and body
        ('union', FeatureUnion(
            transformer_list=[

                # Pipeline for pulling features from the post's subject line
                ('rawtext', Pipeline([
                    ('selector', ItemSelector(key='u_category')),
                    ('labelenc', preprocessing.LabelEncoder()),
                ])),

                # Pipeline for standard bag-of-words model for body
                ('features', Pipeline([
                    ('selector', ItemSelector(key='rawtext')),
                    ('tfidf', TfidfVectorizer(max_df=0.5, min_df=1,
                                              stop_words='english',
                                              token_pattern=u'(?ui)\\b\\w*[a-z]{2,}\\w*\\b')),
                ])),

            ],

            # weight components in FeatureUnion
            transformer_weights={
                'rawtext': 1.0,
                'features': 1.0,
            },
        )),

        # Use a SVC classifier on the combined features
        ('linear_svc', LinearSVC(penalty="l2")),
    ])

    # limit the list of categories to make running this example faster.
    X_train, X_test, y_train, y_test = train_test_split(
        df_resolved.ix[:, (df_resolved.columns != 'assignment_group.name')],
        df_resolved['assignment_group.name'],
        test_size=0.2, random_state=42)

    pipeline.fit(X_train, y_train)
    print(pipeline.score(X_test, y_test))

How can I modify this code so it works properly with my dataframe, and ideally support applying a transformer to several columns at once?

If I take the ItemSelector out of the pipeline and run it on its own, it seems to work. So this works:

    ds = ItemSelector(key='u_category')
    ds.fit(df_resolved)

    labelenc = preprocessing.LabelEncoder()
    labelenc_transformed = labelenc.fit_transform(ds.transform(df_resolved))

Full stack trace:

    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    <ipython-input-...> in <module>()
        136 
        137 
    --> 138 pipeline.fit(X_train, y_train)
        139 #y = pipeline.predict(X_test)
        140 #print(classification_report(y, test.target))

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
        266             This estimator
        267         """
    --> 268         Xt, fit_params = self._fit(X, y, **fit_params)
        269         if self._final_estimator is not None:
        270             self._final_estimator.fit(Xt, y, **fit_params)

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
        232                 pass
        233             elif hasattr(transform, "fit_transform"):
    --> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
        235             else:
        236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
        732             delayed(_fit_transform_one)(trans, name, weight, X, y,
        733                                         **fit_params)
    --> 734             for name, trans, weight in self._iter())
        735 
        736         if not result:

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
        756             # was dispatched. In particular this covers the edge
        757             # case of Parallel used with an exhausted iterator.
    --> 758             while self.dispatch_one_batch(iterator):
        759                 self._iterating = True
        760             else:

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
        606                 return False
        607             else:
    --> 608                 self._dispatch(tasks)
        609                 return True
        610 

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
        569         dispatch_timestamp = time.time()
        570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
    --> 571         job = self._backend.apply_async(batch, callback=cb)
        572         self._jobs.append(job)
        573 

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
        107     def apply_async(self, func, callback=None):
        108         """Schedule a func to be run"""
    --> 109         result = ImmediateResult(func)
        110         if callback:
        111             callback(result)

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
        324         # Don't delay the application, to avoid keeping the input
        325         # arguments in memory
    --> 326         self.results = batch()
        327 
        328     def get(self):

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
        575                         **fit_params):
        576     if hasattr(transformer, 'fit_transform'):
    --> 577         res = transformer.fit_transform(X, y, **fit_params)
        578     else:
        579         res = transformer.fit(X, y, **fit_params).transform(X)

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
        299         """
        300         last_step = self._final_estimator
    --> 301         Xt, fit_params = self._fit(X, y, **fit_params)
        302         if hasattr(last_step, 'fit_transform'):
        303             return last_step.fit_transform(Xt, y, **fit_params)

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
        232                 pass
        233             elif hasattr(transform, "fit_transform"):
    --> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
        235             else:
        236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
        495         else:
        496             # fit method of arity 2 (supervised transformation)
    --> 497             return self.fit(X, y, **fit_params).transform(X)
        498 
        499 

    <ipython-input-...> in transform(self, data_dict)
         55 
         56     def transform(self, data_dict):
    ---> 57         return data_dict[self.key]
         58 
         59 

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/numpy/core/records.py in __getitem__(self, indx)
        497 
        498     def __getitem__(self, indx):
    --> 499         obj = super(recarray, self).__getitem__(indx)
        500 
        501         # copy behavior of getattr, except that here

    ValueError: no field of name u_category

UPDATE:

Even if I use the dataframes directly (NOT the output of train_test_split), the problem persists: (screenshot of the same error)

UPDATE 2: OK, so I removed the SubjectBodyExtractor, since I won't need it. Now the ValueError: no field of name u_category is gone, but I get a new error: TypeError: fit_transform() takes 2 positional arguments but 3 were given.

Stack trace:

    ---------------------------------------------------------------------------
    TypeError                                 Traceback (most recent call last)
    <ipython-input-...> in <module>()
        129 
        130 
    --> 131 pipeline.fit(X_train.ix[:, (X_test.columns != 'assignment_group.name')], X_test['assignment_group.name'])
        132 #y = pipeline.predict(X_test)
        133 #print(classification_report(y, test.target))

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
        266             This estimator
        267         """
    --> 268         Xt, fit_params = self._fit(X, y, **fit_params)
        269         if self._final_estimator is not None:
        270             self._final_estimator.fit(Xt, y, **fit_params)

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
        232                 pass
        233             elif hasattr(transform, "fit_transform"):
    --> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
        235             else:
        236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
        732             delayed(_fit_transform_one)(trans, name, weight, X, y,
        733                                         **fit_params)
    --> 734             for name, trans, weight in self._iter())
        735 
        736         if not result:

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
        756             # was dispatched. In particular this covers the edge
        757             # case of Parallel used with an exhausted iterator.
    --> 758             while self.dispatch_one_batch(iterator):
        759                 self._iterating = True
        760             else:

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
        606                 return False
        607             else:
    --> 608                 self._dispatch(tasks)
        609                 return True
        610 

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
        569         dispatch_timestamp = time.time()
        570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
    --> 571         job = self._backend.apply_async(batch, callback=cb)
        572         self._jobs.append(job)
        573 

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
        107     def apply_async(self, func, callback=None):
        108         """Schedule a func to be run"""
    --> 109         result = ImmediateResult(func)
        110         if callback:
        111             callback(result)

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
        324         # Don't delay the application, to avoid keeping the input
        325         # arguments in memory
    --> 326         self.results = batch()
        327 
        328     def get(self):

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
        575                         **fit_params):
        576     if hasattr(transformer, 'fit_transform'):
    --> 577         res = transformer.fit_transform(X, y, **fit_params)
        578     else:
        579         res = transformer.fit(X, y, **fit_params).transform(X)

    /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
        301         Xt, fit_params = self._fit(X, y, **fit_params)
        302         if hasattr(last_step, 'fit_transform'):
    --> 303             return last_step.fit_transform(Xt, y, **fit_params)
        304         elif last_step is None:
        305             return Xt

    TypeError: fit_transform() takes 2 positional arguments but 3 were given

Yes, that's because LabelEncoder expects only a single array y, while FeatureUnion (like any pipeline step) will try to pass it both X and y.
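For illustration, here is a minimal standalone sketch of that mismatch (the category strings are made up, not taken from df_resolved):

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()

    # LabelEncoder's fit_transform has the signature fit_transform(y):
    le.fit_transform(['hardware', 'software', 'hardware'])   # works -> array([0, 1, 0])

    # Inside a Pipeline/FeatureUnion every step is called as fit_transform(X, y),
    # which is one positional argument too many for LabelEncoder:
    le.fit_transform(['hardware', 'software', 'hardware'], [0, 1, 0])
    # TypeError: fit_transform() takes 2 positional arguments but 3 were given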

See this issue: https://github.com/scikit-learn/scikit-learn/issues/3956

You can use a simple workaround for this:

Define a custom LabelEncoder like this:

    class MyLabelEncoder(BaseEstimator, TransformerMixin):
        def __init__(self):
            self.le = LabelEncoder()

        def fit(self, x, y=None):
            return self.le.fit(x)

        def transform(self, x, y=None):
            return self.le.transform(x).reshape(-1, 1)

        def fit_transform(self, x, y=None):
            self.fit(x)
            return self.transform(x)

And in the pipeline, do this:

    ....
    ....
    ('selector', ItemSelector(key='u_category')),
    ('labelenc', MyLabelEncoder()),

Note the reshape(-1, 1) in the transform() method. That's because FeatureUnion only works with 2-d data: every individual transformer inside a FeatureUnion must return 2-d data.
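For intuition, FeatureUnion effectively hstacks the outputs of its transformers, so each output needs an explicit column dimension. A quick shape check with throwaway labels (not your data):

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    encoded = le.fit_transform(['hw', 'sw', 'hw', 'net'])

    print(encoded.shape)                 # (4,)   -> 1-d, cannot be stacked column-wise
    print(encoded.reshape(-1, 1).shape)  # (4, 1) -> 2-d, one column per sample

    # FeatureUnion roughly does np.hstack (or sparse.hstack) on the transformer
    # outputs, so a (4, 1) column can sit next to e.g. a (4, 5000) tf-idf matrix.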

You may need to add them to the features array; try putting both selected columns into features like this (see the sketch after the snippet) and show me the results:

    features = np.recarray(shape=(len(posts),),
                           dtype=[('u_category', object), ('rawtext', object)])
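As a sketch of how that could be filled from the dataframe (the transformer name DataFrameToRecarray and its use as a replacement first step are my assumptions, not part of your original code):

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin

    class DataFrameToRecarray(BaseEstimator, TransformerMixin):
        """Hypothetical helper: copy the two dataframe columns into the recarray
        layout above, so the downstream ItemSelector(key=...) steps work unchanged."""

        def fit(self, df, y=None):
            return self

        def transform(self, df):
            features = np.recarray(shape=(len(df),),
                                   dtype=[('u_category', object), ('rawtext', object)])
            features['u_category'] = df['u_category'].values
            features['rawtext'] = df['rawtext'].values
            return features

    # e.g. as the first pipeline step in place of SubjectBodyExtractor:
    # pipeline = Pipeline([('to_recarray', DataFrameToRecarray()), ('union', ...), ...])

That said, since ItemSelector only does data_dict[self.key] and that already works on a pandas DataFrame (as your standalone test shows), it may be enough to drop the recarray step entirely and feed the dataframe straight into the FeatureUnion.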