Comments (8)
The graph:
# Group the ShapML output rows by feature so per-feature effects can be aggregated.
data_plot = DataFrames.groupby(data_shap, [:feature_name]);
# Model baseline prediction (ShapML's intercept column), rounded for the axis label.
baseline = round(data_shap.intercept[1], digits = 1)
using DataFramesMeta
# Sum the Shapley effects per feature, then sort descending by that sum (column 2).
# NOTE(review): the plot title says "Mean Absolute Shapley Value", but this sums
# raw :shap_effect values (neither abs nor mean) — confirm the intended aggregate.
regr=sort(@combine(data_plot,:x2 = sum(:shap_effect)),order(:2,rev=true));
using Gadfly # Plotting
# Horizontal bar chart of the top-10 features by aggregated Shapley effect.
p = Gadfly.plot(regr[1:10,:], y = :feature_name, x = :x2, Coord.cartesian(yflip = true),
Scale.y_discrete, Geom.bar(position = :dodge, orientation = :horizontal),
Theme(bar_spacing = 1mm),
Guide.xlabel("|Shapley effect| (baseline = $baseline)"), Guide.ylabel(nothing),
Guide.title("Feature Importance - Mean Absolute Shapley Value"))
from shapml.jl.
Hi, I managed to do this using an AMLPipeline — could you help me with the graph of feature importance? I am stuck with `DataFrames.by`, which should now be `groupby`!
# Build an AutoMLPipeline: discriminate categorical vs numeric columns, one-hot
# encode the categoricals, PCA-reduce the numerics, then feed both branches
# into a pruned decision tree.
catf = CatFeatureSelector()
numf = NumFeatureSelector()
disc = CatNumDiscriminator()
pt = PrunedTree()
pca=SKPreprocessor("PCA");
# |> chains transformers; + concatenates the two feature branches.
# NOTE(review): `ohe` must already be defined (e.g. AMLPipelineBase.OneHotEncoder()).
pvote = disc |> ((catf |> ohe) + (numf|>pca)) |> pt
sample_size = 60 # Number of Monte Carlo samples.
AMLPipelineBase.fit!(pvote,X,y)
# ShapML prediction adapter: return the fitted pipeline's predictions on `data`
# as a single-column DataFrame named :y_pred, the shape ShapML.shap expects.
predict_function(model, data) = DataFrame(y_pred = AMLPipelineBase.transform(model, data))
# Compute stochastic Shapley values: explain every row of X against X itself
# as the reference (background) distribution, using `sample_size` Monte Carlo
# samples per feature with a fixed seed for reproducibility.
data_shap = ShapML.shap(explain = X,
reference = X,
model = pvote,
predict_function = predict_function,
sample_size = sample_size,
seed = 1
)
# Print the full result table without column truncation.
show(data_shap, allcols = true)
from shapml.jl.
I'm sorry about the bug and delayed response, but I'm afraid that I cannot support this repo right now--even though I'd really like to. In the meantime, I've given access to a collaborator and will likely transfer ownership to JuliaAI or the Alan Turing Institute soon.
To your specific questions, ShapML is not set up to work with classification. It wouldn't be all that difficult to add, but it's not something that I could take on right now.
from shapml.jl.
No worries — I made it work with classification (see the example above) and also made the graph (I will post it in a few hours). Thanks, best regards.
from shapml.jl.
@michele2198 would you mind posting full code for this? I am also trying to implement ShapML for multi class classification.
from shapml.jl.
Sure, here is all the code:
`using Pkg;
Pkg.add("XLSX")
using XLSX
import Base.Threads.@threads
#Pkg.add("MLDataPattern")
#Pkg.add("OutlierDetection")
#Pkg.add("OutlierDetectionData")
using MLDataPattern
#X_bal, y_bal = oversample((X, y));
#using OutlierDetection
#using OutlierDetectionData: ODDS
using Markdown
using InteractiveUtils
using Resample
import Pkg;
# One-time package installation lines, kept commented after first run.
#Pkg.add("PrettyPrinting");Pkg.add("PyPlot");Pkg.add("MLJ");Pkg.add("CSV");Pkg.add("EvoTrees");Pkg.add("UrlDownload");Pkg.add("MLJFlux");Pkg.add("DataFrames");Pkg.add("OutlierDetectionNetworks");Pkg.add("GLMakie");Pkg.add("MLJModels");Pkg.add("MLJScikitLearnInterface");Pkg.add("MLJMultivariateStatsInterface");Pkg.add("MLJNaiveBayesInterface");Pkg.add("NearestNeighborModels");Pkg.add("LightGBM");Pkg.add("MLJGLMInterface");Pkg.add("MLJLIBSVMInterface");
#Pkg.add("MLJBase");Pkg.add("CategoricalArrays");Pkg.add("Plots");Pkg.add("DecisionTree");Pkg.add("MLJDecisionTreeInterface");Pkg.add("BetaML");Pkg.add("MLJLinearModels");Pkg.add("MLJXGBoostInterface")
#Pkg.add("LIBSVM")
#Pkg.add("ScikitLearn")
#Pkg.add("ShapML")
#Pkg.add("DataFramesMeta")
#using UrlDownload
using MLJMultivariateStatsInterface
using DataFrames
using MLJModels
#using PrettyPrinting
using MLJBase
#using PyPlot
using MLJ
using CSV
using CategoricalArrays
using Plots
using FeatureSelectors
using MLBase
using MLJScikitLearnInterface
using ShapML
using AMLPipelineBase
using DataFramesMeta
using AutoMLPipeline
using Random
# Candidate classifiers for the tuned pipeline — exactly one @load line is
# left uncommented; the chosen model is bound to the (misleadingly named) KNN.
#KNN = @load DecisionTreeClassifier pkg=BetaML;
#KNN = @load XGBoostClassifier pkg=XGBoost;
KNN = @load RandomForestClassifier pkg=DecisionTree;
#KNN = @load AdaBoostStumpClassifier pkg=DecisionTree;
#KNN = @load LinearBinaryClassifier pkg=GLM; performs terribly
#KNN = @load EvoTreeClassifier pkg=EvoTrees;
#KNN = @load KNNClassifier pkg=NearestNeighborModels
#KNN = @load PegasosClassifier pkg=BetaML
#KNN = @load NeuralNetworkClassifier pkg=MLJFlux
#KNN = @load KNNClassifier pkg=NearestNeighborModels
# Forward feature selection via MLJ model tuning.
# Builds a Standardizer |> FeatureSelector |> OneHotEncoder |> KNN pipeline and
# tunes the selector over nested feature prefixes (first 1, 2, ..., all columns
# of Xred2), scored by Brier score under repeated 5-fold CV.
# Returns the best-performing feature subset (a vector of column Symbols).
# NOTE(review): relies on the global `KNN` model constant loaded above.
function firstselection(Xred2,y)
selector=MLJ.FeatureSelector();hot = MLJ.OneHotEncoder();stand=MLJ.Standardizer()
MyPipe=stand|>selector|>hot|>KNN
_names = schema(Xred2).names |> collect # make this a vector not tuple!
#_ncols = length(_names)
# Candidate feature sets: every prefix of the column list.
cases = [Symbol.(_names[1:i]) for i in 1:length(_names)]
# wrap in a tuning strategy:
tuned_pipe = TunedModel(; model=MyPipe,
range=range(MyPipe, :(feature_selector.features), values=cases),
measures=[BrierScore()],#cross_entropy, Accuracy(), TruePositiveRate(), TrueNegativeRate()
resampling=CV(nfolds=5),
acceleration=CPUThreads(),
repeats=10)
# Fit the tuning machine, then read the winning feature subset from its report.
mach = tuned_pipe_machine = machine(tuned_pipe, Xred2, y) |> MLJ.fit!
return report(mach).best_model.feature_selector.features
end
# Adapter required by ShapML.shap: wrap the fitted pipeline's predictions for
# `data` in a one-column DataFrame named :y_pred.
predict_function(model, data) = DataFrame(y_pred = AMLPipelineBase.transform(model, data))
# Rank features of Xred by Shapley values and return the top `numfeatures`
# (global) feature names. Fits an AutoMLPipeline (one-hot categoricals +
# numerics, robust-scaled, into a pruned tree), runs ShapML.shap with 100
# Monte Carlo samples, sums :shap_effect per feature, and sorts descending.
# NOTE(review): this sums raw :shap_effect, not |shap_effect| — confirm whether
# mean absolute Shapley value was intended.
# NOTE(review): uses globals catf, ohe, numf, rb, pt, predict_function, numfeatures.
function featureselectShapley(Xred,y2)
pvote = ((catf |> ohe) + (numf)) |>rb|> pt
#sample_size = 10 # Number of Monte Carlo samples.
AMLPipelineBase.fit!(pvote,Xred,y2)
data_shap = ShapML.shap(explain = Xred,
reference = Xred,
model = pvote,
predict_function = predict_function,
sample_size = 100,
seed = 1);
#data_plot = DataFrames.groupby(data_shap, [:feature_name])
#baseline = round(data_shap.intercept[1], digits = 1)
# Aggregate per feature; sort by column 2 (the summed effect), descending.
regr=sort(@combine(DataFrames.groupby(data_shap, [:feature_name]),:x2 = sum(:shap_effect)),order(:2,rev=true))
return regr.feature_name[1:numfeatures]
end
# Pipeline building blocks (AutoMLPipeline / AMLPipelineBase operators).
numf = NumFeatureSelector();disc = CatNumDiscriminator();pt = PrunedTree()
ohe = AMLPipelineBase.OneHotEncoder();catf = CatFeatureSelector() ;stack = StackEnsemble();ada = Adaboost()
rb=skoperator("RobustScaler");jrf=RandomForest();vote = VoteEnsemble()
#https://juliaai.github.io/DataScienceTutorials.jl/end-to-end/breastcancer/
#url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data";
# Load the dataset; drop the first column (assumed to be a row id — confirm).
data1=CSV.read("Tripleneg.csv", DataFrame)
df1 = DataFrame(data1)[:, 2:end];
# Three encodings of the :outcome target are built below:
#   y2 — raw numeric, y1/y — ordered categorical (y as explicit "0"/"1" levels).
y2, X = unpack(df1, ==(:outcome),name->true, rng = 0); #y2: plain numeric vector
coerce!(df1, :outcome => OrderedFactor{2}); #essential!!
y, X = unpack(df1, ==(:outcome),name->true, rng = 0);
X=convert.(Float32,X)
y1=y #the "ordered" factor vector
y=categorical(string.(y),ordered = true,levels = ["0", "1"])
# X1: two-column slice, used only for MLJ model matching below.
X1=X[:,1:2]
describe(X)
# Experiment hyperparameters.
numfolds=10
numfeatures=20
numMontecarlos=1
# Accumulators for per-run benchmark results.
model_names=Vector{String}()
package_names=Vector{String}()
loss_acc=Float64[];
loss_sens=Float64[];
loss_spec=Float64[];
start = time()
# Stratified k-fold cross-validation of classifier constructor `clf`.
# Shuffles the data, builds stratified folds, undersamples each training split
# to balance classes, performs three-stage feature selection (univariate Pearson
# correlation, then Shapley ranking, then tuned forward selection), fits
# Standardizer |> clf, and accumulates out-of-fold predictions.
# Returns (predCV, yCV) for downstream metric computation.
# NOTE(review): uses globals numfolds, numfeatures, featureselectShapley,
# firstselection; arguments y1 and y2 appear unused inside the body.
function modelCVeval(clf,X,y,y1,y2)
xx=length(y);
rp = randperm(xx);
Xshuffle=(X[rp,:]);yshuffle=y[rp];
# MLBase.StratifiedKfold yields the *training* row indices of each fold.
rows = collect(StratifiedKfold(yshuffle, numfolds))
#collect(ScikitLearn.CrossValidation.StratifiedKFold(y, n_folds=5, shuffle=true, random_state=1))
local yCV=[];
local predCV=[];
for i=1:numfolds
#println("i = $i on thread $(Threads.threadid())")
local sss=Vector(1:size(yshuffle,1))
local train=rows[i]
# Test set = complement of this fold's training rows.
local test=vec(sss[setdiff(1:size(sss,1), rows[i]), :])
#----------------UnivariateFeatureSelector-------------------
# Undersample the majority class, then rebuild categorical (y1_bal) and
# integer 0/1 (y2_bal) encodings of the balanced target.
X_bal, y_bal = undersample((Matrix(Xshuffle[train,:])', yshuffle[train]));y1_bal=categorical(int.(Array(y_bal)).-1);y2_bal=(int.(Array(y_bal)).-1);
X_test=Xshuffle[test,:];X_bal=DataFrame(Matrix(X_bal'),names(Xshuffle));
#local Xred=X[:,select_features(UnivariateFeatureSelector(method=f_test, k=10),X[train,:],Array(y1[train]))];
# Stage 1: keep the top `numfeatures` columns by Pearson correlation.
local Xred=X_bal[:,select_features(UnivariateFeatureSelector(method=pearson_correlation, k=numfeatures),X_bal,Array(y1_bal))];
#SHAPLEY-----------------------------
#select!(Xred,featureselectShapley(Xred[train,:],y2[train]))
# Stage 2: reorder/keep columns by Shapley importance.
select!(Xred,featureselectShapley(Xred,y2_bal))
#°°°°°°°°°°°°°°°feature selection of top features------------------------
# Stage 3: tuned forward selection over feature prefixes.
select!(Xred,firstselection(Xred,y_bal))
#select!(Xred,featureselectShapley(Xred[train,:],y2[train]))
#training the classifier!!!!!!!!!!!!!!!!!!!!
clf_machine = machine(Standardizer()|> clf(), Xred, y_bal)
MLJBase.fit!(clf_machine);
# Predict on the held-out fold using only the finally-selected columns.
append!(predCV,MLJ.predict(clf_machine, X_test[:,names(Xred)]));
append!(yCV,yshuffle[test]);
end #end numfolds
#Obtaining different evaluation metrics
return predCV,yCV
end
# Benchmark loop: for the first five models MLJ deems compatible with (X1, y1),
# plus MLJFlux's NeuralNetworkClassifier, run `numMontecarlos` repetitions of
# cross-validated evaluation and record accuracy / sensitivity / specificity.
for m in [models(matching(X1, y1))[1:5];(name="NeuralNetworkClassifier",package_name="MLJFlux")]#Threads.@threads #NeuralNetworkClassifier pkg=MLJFlux
model_name=m.name
package_name=m.package_name
# @load is a macro, so runtime model/package names must be spliced via eval.
eval(:(clf = @load $model_name pkg=$package_name verbosity=0))
for nn=1:numMontecarlos
# Skip models known not to work in this setup.
if m.name≠"DSADDetector"
if m.name≠"ESADDetector"#it is unsupervised
if m.name≠"GaussianProcessClassifier"#doesn't work!
if m.name≠"PerceptronClassifier" #doesn't work!!
(predCV,yCV)=modelCVeval(clf,X,y,y1,y2)
# Probabilistic classifiers: collapse predicted distributions with mode.(…).
if m.name≠"RidgeCVClassifier"&&m.name≠"RidgeClassifier"&&m.name≠"SVC"&&m.name≠"SVMClassifier"&&m.name≠"SVMLinearClassifier"&&m.name≠"SVMNuClassifier"&&m.name≠"SGDClassifier"&&m.name≠"PassiveAggressiveClassifier"&&m.name≠"NuSVC"&&m.name≠"LinearSVC"&&m.name≠"DeterministicConstantClassifier"
acc = MLJ.accuracy(mode.(predCV), yCV)
#f1_score = MLJ.f1score(mode.(predCV), yCV)
# Labels round-trip through strings to plain Ints for the MLJBase metrics.
sens = MLJBase.true_positive_rate(parse.(Int,string.(mode.(predCV))), parse.(Int,string.(yCV)))
f1_score =MLJ.f1score(parse.(Int,string.(mode.(predCV))), parse.(Int,string.(yCV)))
spec = MLJBase.true_negative_rate(parse.(Int,string.(mode.(predCV))), parse.(Int,string.(yCV)))
end
# Deterministic classifiers: predictions are already point labels (no mode).
if m.name=="RidgeCVClassifier"||m.name=="RidgeClassifier"||m.name=="SVC"||m.name=="SVMClassifier"||m.name=="SVMLinearClassifier"||m.name=="SVMNuClassifier"||m.name=="SGDClassifier"||m.name=="PassiveAggressiveClassifier"||m.name=="NuSVC"||m.name=="LinearSVC"||m.name=="DeterministicConstantClassifier"
acc = MLJ.accuracy((predCV), yCV)
#f1_score = MLJ.f1score(mode.(predCV), yCV)
sens = MLJBase.true_positive_rate(parse.(Int,string.((predCV))), parse.(Int,string.(yCV)))
f1_score =MLJ.f1score(parse.(Int,string.((predCV))), parse.(Int,string.(yCV)))
spec = MLJBase.true_negative_rate(parse.(Int,string.((predCV))), parse.(Int,string.(yCV)))
end
# Record this run's metrics alongside the model identity.
push!(model_names, m.name)
push!(package_names,m.package_name)
append!(loss_acc, acc)
append!(loss_sens, sens)
append!(loss_spec, spec)
end;end;end;end;
end
end
# Total wall-clock time for the whole benchmark.
elapsed = time() - start
# Collect per-run metrics into a single results table.
learners=vcat(DataFrame(mname=model_names,pname=package_names,sensCV=loss_sens,specCV=loss_spec,accCV=loss_acc))
@show learners
@show elapsed
# Mean accuracy per (model, package) across Monte Carlo repetitions.
@combine(DataFrames.groupby(learners,[:mname,:pname]),:aveacc=mean(:accCV))
#CSV.write("File Name.csv", learners)
# Export the results table to Excel.
XLSX.writetable("df.xlsx", collect(DataFrames.eachcol(learners)), DataFrames.names(learners))
`
from shapml.jl.
Thanks for this. Ours is a multi class classification problem, so I will have to have a little play around and see if I can adapt it to that. Can you think of any reason why that shouldn't work?
from shapml.jl.
No — note that Shapley is not designed specifically for binary classification, but for regression, so I see no reason it wouldn't work with multiclass.
My code is designed to predict the "outcome" column of the CSV file, which is 0 or 1.
from shapml.jl.
Related Issues (11)
- Add additivity constraint HOT 1
- Error installing ShapML in windows HOT 3
- Error "MethodError: no method matching DataFrame (:: Matrix {Float64}, :: Symbol)"
- Update MLJ examples HOT 1
- TagBot trigger issue HOT 1
- ShapML not working with @load RandomForestClassifier pkg = DecisionTree HOT 3
- ERROR: OutOfMemoryError()Worker 2 terminated. HOT 2
- Please extend your DataFrames [compat] to "^0.21" HOT 6
- Warning: left joining data frames using join is deprecated, use `leftjoin(df1, df2, on=[:index, :feature_name], makeunique=false, indicator=nothing, validate=(false, false))` instead
- Support for DataFrames 22.0? HOT 11
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from shapml.jl.