I adapted this code from the examples/encoding_examples.py file to use a pipeline. cross_val_score fails with the error shown at the end of this post:
import pandas as pd
import numpy as np
from sklearn import cross_validation, linear_model, model_selection
import category_encoders
from examples.source_data.loaders import get_mushroom_data, get_cars_data, get_splice_data
from sklearn.pipeline import make_pipeline
# Minimal reproduction: run a BinaryEncoder + LogisticRegression pipeline
# through 5-fold cross-validation on the mushroom example data.
# The loader returns three values; only X and y are used below.
X, y, mapping = get_mushroom_data()
# Encoder configured with handle_unknown="ignore".
t = category_encoders.BinaryEncoder(handle_unknown = "ignore")
# The encoder is the only preprocessing step ahead of the classifier.
mypipeline = make_pipeline(t, linear_model.LogisticRegression())
# This call raises "ValueError: cannot convert float NaN to integer" from
# BinaryEncoder.transform while a fold is being scored (see the traceback).
# NOTE(review): sklearn.cross_validation is the legacy module; model_selection
# is imported above but not used here -- confirm whether the same failure
# occurs with model_selection.cross_val_score.
cross_validation.cross_val_score(mypipeline, X, y, n_jobs=1, cv=5)
Abridged list of installed packages:
- numpy 1.14.0 py36h4a99626_1
- pandas 0.22.0 py36h6538335_0
- python 3.6.4 h6538335_1
- scikit-learn 0.19.1 py36h53aea1b_0
ERROR
ValueError Traceback (most recent call last)
in ()
10 mypipeline = make_pipeline(t, linear_model.LogisticRegression())
11
---> 12 cross_validation.cross_val_score(mypipeline, X, y, n_jobs=1, cv=5)
13
14
C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
1579 train, test, verbose, None,
1580 fit_params)
-> 1581 for train, test in cv)
1582 return np.array(scores)[:, 0]
1583
C:\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1692
1693 else:
-> 1694 test_score = _score(estimator, X_test, y_test, scorer)
1695 if return_train_score:
1696 train_score = _score(estimator, X_train, y_train, scorer)
C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _score(estimator, X_test, y_test, scorer)
1749 score = scorer(estimator, X_test)
1750 else:
-> 1751 score = scorer(estimator, X_test, y_test)
1752 if hasattr(score, 'item'):
1753 try:
C:\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
242 def _passthrough_scorer(estimator, *args, **kwargs):
243 """Function that wraps estimator.score"""
--> 244 return estimator.score(*args, **kwargs)
245
246
C:\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
113
114 # lambda, but not partial, allows help() to work with update_wrapper
--> 115 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
116 # update the docstring of the returned function
117 update_wrapper(out, self.fn)
C:\Anaconda3\lib\site-packages\sklearn\pipeline.py in score(self, X, y, sample_weight)
484 for name, transform in self.steps[:-1]:
485 if transform is not None:
--> 486 Xt = transform.transform(Xt)
487 score_params = {}
488 if sample_weight is not None:
C:\Anaconda3\lib\site-packages\category_encoders\binary.py in transform(self, X)
163 X = self.ordinal_encoder.transform(X)
164
--> 165 X = self.binary(X, cols=self.cols)
166
167 if self.drop_invariant:
C:\Anaconda3\lib\site-packages\category_encoders\binary.py in binary(self, X_in, cols)
248
249 # map the ordinal column into a list of these digits, of length digits
--> 250 X[col] = X[col].map(lambda x: self.col_transform(x, digits))
251
252 for dig in range(digits):
C:\Anaconda3\lib\site-packages\pandas\core\series.py in map(self, arg, na_action)
2352 else:
2353 # arg is a function
-> 2354 new_values = map_f(values, arg)
2355
2356 return self._constructor(new_values,
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()
C:\Anaconda3\lib\site-packages\category_encoders\binary.py in <lambda>(x)
248
249 # map the ordinal column into a list of these digits, of length digits
--> 250 X[col] = X[col].map(lambda x: self.col_transform(x, digits))
251
252 for dig in range(digits):
C:\Anaconda3\lib\site-packages\category_encoders\binary.py in col_transform(col, digits)
309 else:
310
--> 311 col = list("{0:b}".format(int(col)))
312 if len(col) == digits:
313 return col
ValueError: cannot convert float NaN to integer