How to get most informative features for scikit-learn classifiers?
With the help of larsmans code I came up with this code for the binary case:
def show_most_informative_features(vectorizer, clf, n=20):
    """Print the n most negative and n most positive features of a binary
    linear classifier side by side.

    vectorizer -- fitted vectorizer exposing the feature names.
    clf        -- fitted classifier with a ``coef_`` matrix; only row 0 is
                  used (binary models store a single flattened row).
    n          -- number of features to show from each end.

    Returns the printed rows as a list of (coef_lo, name_lo, coef_hi, name_hi)
    tuples so callers can inspect them programmatically.
    """
    # Newer scikit-learn renamed get_feature_names -> get_feature_names_out;
    # support both without breaking older versions.
    names_fn = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = names_fn()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    # Pair the n smallest coefficients with the n largest (largest first).
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    rows = []
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        rows.append((coef_1, fn_1, coef_2, fn_2))
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
    return rows
How to get most informative features for scikit-learn classifier for different class?
In the case of binary classification, it seems that the coefficient array has been flattened into a single row.
Let's try to relabel our data with only two labels:
import codecs, re, time
from itertools import chain
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

trainfile = 'train.txt'

# Vectorizing data: each line of train.txt becomes one document.
train = []
word_vectorizer = CountVectorizer(analyzer='word')
# Use a context manager so the file handle is closed after fit_transform
# (the original left it open).
with codecs.open(trainfile, 'r', 'utf8') as fin:
    trainset = word_vectorizer.fit_transform(fin)
tags = ['bs', 'pt', 'bs', 'pt']

# Training NB
mnb = MultinomialNB()
mnb.fit(trainset, tags)
print(mnb.classes_)
print(mnb.coef_[0])
# With only two classes MultinomialNB keeps a single coefficient row,
# so this raises IndexError (demonstrated below).
print(mnb.coef_[1])
[out]:
['bs' 'pt']
[-5.55682806 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -5.55682806
-4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088 -4.86368088
-4.1705337 -5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -5.55682806 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.45821577 -4.86368088
-4.86368088 -4.45821577 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -4.1705337 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -4.45821577 -4.86368088 -4.86368088]
Traceback (most recent call last):
File "test.py", line 24, in <module>
print mnb.coef_[1]
IndexError: index 1 is out of bounds for axis 0 with size 1
So let's do some diagnostics:
# Per-class raw feature counts: shape [n_classes, n_features].
print(mnb.feature_count_)
# The single (flattened) coefficient row of the binary model.
print(mnb.coef_[0])
[out]:
[[ 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1.
1. 1. 2. 2. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 2. 1.
1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0.
0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0.
1. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1.
1. 0. 0. 1. 0. 0. 0. 4. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 1. 0. 0. 0. 0.]
[ 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 3. 0. 1. 0. 1. 0.
0. 0. 1. 2. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 0.
0. 0. 0. 2. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1.
1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1.
0. 0. 1. 1. 2. 1. 1. 2. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0.
1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0.
0. 1. 1. 0. 1. 1. 1. 3. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1.
1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1.
1. 1. 0. 1. 1. 0. 1. 2. 1. 1.]]
[-5.55682806 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -5.55682806
-4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088 -4.86368088
-4.1705337 -5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -5.55682806 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.45821577 -4.86368088
-4.86368088 -4.45821577 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -4.1705337 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -4.45821577 -4.86368088 -4.86368088]
It seems the per-class feature counts are kept, but the coefficients were flattened into a single row to save memory, so let's try:
# Pair every feature with its coefficient and its raw count in each class.
# enumerate() replaces the original hand-maintained index counter.
coef_features_c1_c2 = []
for index, (feat, c1, c2) in enumerate(
        zip(word_vectorizer.get_feature_names(),
            mnb.feature_count_[0],
            mnb.feature_count_[1])):
    coef_features_c1_c2.append((mnb.coef_[0][index], feat, c1, c2))

# Sorting by coefficient groups one class's features at each end of the list.
for row in sorted(coef_features_c1_c2):
    print(row)
[out]:
(-5.5568280616995374, u'acuerdo', 1.0, 0.0)
(-5.5568280616995374, u'al', 1.0, 0.0)
(-5.5568280616995374, u'alex', 1.0, 0.0)
(-5.5568280616995374, u'algo', 1.0, 0.0)
(-5.5568280616995374, u'andaba', 1.0, 0.0)
(-5.5568280616995374, u'andrea', 1.0, 0.0)
(-5.5568280616995374, u'bien', 1.0, 0.0)
(-5.5568280616995374, u'buscando', 1.0, 0.0)
(-5.5568280616995374, u'como', 1.0, 0.0)
(-5.5568280616995374, u'con', 1.0, 0.0)
(-5.5568280616995374, u'conseguido', 1.0, 0.0)
(-5.5568280616995374, u'distancia', 1.0, 0.0)
(-5.5568280616995374, u'doprinese', 1.0, 0.0)
(-5.5568280616995374, u'es', 2.0, 0.0)
(-5.5568280616995374, u'est\xe1', 1.0, 0.0)
(-5.5568280616995374, u'eulex', 1.0, 0.0)
(-5.5568280616995374, u'excusa', 1.0, 0.0)
(-5.5568280616995374, u'fama', 1.0, 0.0)
(-5.5568280616995374, u'guasch', 1.0, 0.0)
(-5.5568280616995374, u'ha', 1.0, 0.0)
(-5.5568280616995374, u'incident', 1.0, 0.0)
(-5.5568280616995374, u'ispit', 1.0, 0.0)
(-5.5568280616995374, u'istragu', 1.0, 0.0)
(-5.5568280616995374, u'izbijanju', 1.0, 0.0)
(-5.5568280616995374, u'ja\u010danju', 1.0, 0.0)
(-5.5568280616995374, u'je', 1.0, 0.0)
(-5.5568280616995374, u'jedan', 1.0, 0.0)
(-5.5568280616995374, u'jo\u0161', 1.0, 0.0)
(-5.5568280616995374, u'kapaciteta', 1.0, 0.0)
(-5.5568280616995374, u'kosova', 1.0, 0.0)
(-5.5568280616995374, u'la', 1.0, 0.0)
(-5.5568280616995374, u'lequio', 1.0, 0.0)
(-5.5568280616995374, u'llevar', 1.0, 0.0)
(-5.5568280616995374, u'lo', 2.0, 0.0)
(-5.5568280616995374, u'misije', 1.0, 0.0)
(-5.5568280616995374, u'muy', 1.0, 0.0)
(-5.5568280616995374, u'm\xe1s', 1.0, 0.0)
(-5.5568280616995374, u'na', 1.0, 0.0)
(-5.5568280616995374, u'nada', 1.0, 0.0)
(-5.5568280616995374, u'nasilja', 1.0, 0.0)
(-5.5568280616995374, u'no', 1.0, 0.0)
(-5.5568280616995374, u'obaviti', 1.0, 0.0)
(-5.5568280616995374, u'obe\u0107ao', 1.0, 0.0)
(-5.5568280616995374, u'parecer', 1.0, 0.0)
(-5.5568280616995374, u'pone', 1.0, 0.0)
(-5.5568280616995374, u'por', 1.0, 0.0)
(-5.5568280616995374, u'po\u0161to', 1.0, 0.0)
(-5.5568280616995374, u'prava', 1.0, 0.0)
(-5.5568280616995374, u'predstavlja', 1.0, 0.0)
(-5.5568280616995374, u'pro\u0161losedmi\u010dnom', 1.0, 0.0)
(-5.5568280616995374, u'relaci\xf3n', 1.0, 0.0)
(-5.5568280616995374, u'sjeveru', 1.0, 0.0)
(-5.5568280616995374, u'taj', 1.0, 0.0)
(-5.5568280616995374, u'una', 1.0, 0.0)
(-5.5568280616995374, u'visto', 1.0, 0.0)
(-5.5568280616995374, u'vladavine', 1.0, 0.0)
(-5.5568280616995374, u'ya', 1.0, 0.0)
(-5.5568280616995374, u'\u0107e', 1.0, 0.0)
(-4.863680881139592, u'aj', 0.0, 1.0)
(-4.863680881139592, u'ajudou', 0.0, 1.0)
(-4.863680881139592, u'alpsk\xfdmi', 0.0, 1.0)
(-4.863680881139592, u'alpy', 0.0, 1.0)
(-4.863680881139592, u'ao', 0.0, 1.0)
(-4.863680881139592, u'apresenta', 0.0, 1.0)
(-4.863680881139592, u'bl\xedzko', 0.0, 1.0)
(-4.863680881139592, u'come\xe7o', 0.0, 1.0)
(-4.863680881139592, u'da', 2.0, 1.0)
(-4.863680881139592, u'decepcionantes', 0.0, 1.0)
(-4.863680881139592, u'deti', 0.0, 1.0)
(-4.863680881139592, u'dificuldades', 0.0, 1.0)
(-4.863680881139592, u'dif\xedcil', 1.0, 1.0)
(-4.863680881139592, u'do', 0.0, 1.0)
(-4.863680881139592, u'druh', 0.0, 1.0)
(-4.863680881139592, u'd\xe1', 0.0, 1.0)
(-4.863680881139592, u'ela', 0.0, 1.0)
(-4.863680881139592, u'encontrar', 0.0, 1.0)
(-4.863680881139592, u'enfrentar', 0.0, 1.0)
(-4.863680881139592, u'for\xe7as', 0.0, 1.0)
(-4.863680881139592, u'furiosa', 0.0, 1.0)
(-4.863680881139592, u'golf', 0.0, 1.0)
(-4.863680881139592, u'golfistami', 0.0, 1.0)
(-4.863680881139592, u'golfov\xfdch', 0.0, 1.0)
(-4.863680881139592, u'hotelmi', 0.0, 1.0)
(-4.863680881139592, u'hra\u0165', 0.0, 1.0)
(-4.863680881139592, u'ide', 0.0, 1.0)
(-4.863680881139592, u'ihr\xedsk', 0.0, 1.0)
(-4.863680881139592, u'intranspon\xedveis', 0.0, 1.0)
(-4.863680881139592, u'in\xedcio', 0.0, 1.0)
(-4.863680881139592, u'in\xfd', 0.0, 1.0)
(-4.863680881139592, u'kde', 0.0, 1.0)
(-4.863680881139592, u'kombin\xe1cie', 0.0, 1.0)
(-4.863680881139592, u'komplex', 0.0, 1.0)
(-4.863680881139592, u'kon\u010diarmi', 0.0, 1.0)
(-4.863680881139592, u'lado', 0.0, 1.0)
(-4.863680881139592, u'lete', 0.0, 1.0)
(-4.863680881139592, u'longo', 0.0, 1.0)
(-4.863680881139592, u'ly\u017eova\u0165', 0.0, 1.0)
(-4.863680881139592, u'man\u017eelky', 0.0, 1.0)
(-4.863680881139592, u'mas', 0.0, 1.0)
(-4.863680881139592, u'mesmo', 0.0, 1.0)
(-4.863680881139592, u'meu', 0.0, 1.0)
(-4.863680881139592, u'minha', 0.0, 1.0)
(-4.863680881139592, u'mo\u017enos\u0165ami', 0.0, 1.0)
(-4.863680881139592, u'm\xe3e', 0.0, 1.0)
(-4.863680881139592, u'nad\u0161en\xfdmi', 0.0, 1.0)
(-4.863680881139592, u'negativas', 0.0, 1.0)
(-4.863680881139592, u'nie', 0.0, 1.0)
(-4.863680881139592, u'nieko\u013ek\xfdch', 0.0, 1.0)
(-4.863680881139592, u'para', 0.0, 1.0)
(-4.863680881139592, u'parecem', 0.0, 1.0)
(-4.863680881139592, u'pod', 0.0, 1.0)
(-4.863680881139592, u'pon\xfakaj\xfa', 0.0, 1.0)
(-4.863680881139592, u'potrebuj\xfa', 0.0, 1.0)
(-4.863680881139592, u'pri', 0.0, 1.0)
(-4.863680881139592, u'prova\xe7\xf5es', 0.0, 1.0)
(-4.863680881139592, u'punham', 0.0, 1.0)
(-4.863680881139592, u'qual', 0.0, 1.0)
(-4.863680881139592, u'qualquer', 0.0, 1.0)
(-4.863680881139592, u'quem', 0.0, 1.0)
(-4.863680881139592, u'rak\xfaske', 0.0, 1.0)
(-4.863680881139592, u'rezortov', 0.0, 1.0)
(-4.863680881139592, u'sa', 0.0, 1.0)
(-4.863680881139592, u'sebe', 0.0, 1.0)
(-4.863680881139592, u'sempre', 0.0, 1.0)
(-4.863680881139592, u'situa\xe7\xf5es', 0.0, 1.0)
(-4.863680881139592, u'spojen\xfdch', 0.0, 1.0)
(-4.863680881139592, u'suplantar', 0.0, 1.0)
(-4.863680881139592, u's\xfa', 0.0, 1.0)
(-4.863680881139592, u'tak', 0.0, 1.0)
(-4.863680881139592, u'talianske', 0.0, 1.0)
(-4.863680881139592, u'teve', 0.0, 1.0)
(-4.863680881139592, u'tive', 0.0, 1.0)
(-4.863680881139592, u'todas', 0.0, 1.0)
(-4.863680881139592, u'tr\xe1venia', 0.0, 1.0)
(-4.863680881139592, u've\u013ek\xfd', 0.0, 1.0)
(-4.863680881139592, u'vida', 0.0, 1.0)
(-4.863680881139592, u'vo', 0.0, 1.0)
(-4.863680881139592, u'vo\u013en\xe9ho', 0.0, 1.0)
(-4.863680881139592, u'vysok\xfdmi', 0.0, 1.0)
(-4.863680881139592, u'vy\u017eitia', 0.0, 1.0)
(-4.863680881139592, u'v\xe4\u010d\u0161ine', 0.0, 1.0)
(-4.863680881139592, u'v\u017edy', 0.0, 1.0)
(-4.863680881139592, u'zauj\xedmav\xe9', 0.0, 1.0)
(-4.863680881139592, u'zime', 0.0, 1.0)
(-4.863680881139592, u'\u010dasu', 0.0, 1.0)
(-4.863680881139592, u'\u010fal\u0161\xedmi', 0.0, 1.0)
(-4.863680881139592, u'\u0161vaj\u010diarske', 0.0, 1.0)
(-4.4582157730314274, u'de', 2.0, 2.0)
(-4.4582157730314274, u'foi', 0.0, 2.0)
(-4.4582157730314274, u'mais', 0.0, 2.0)
(-4.4582157730314274, u'me', 0.0, 2.0)
(-4.4582157730314274, u'\u010di', 0.0, 2.0)
(-4.1705337005796466, u'as', 0.0, 3.0)
(-4.1705337005796466, u'que', 4.0, 3.0)
Now we see some patterns. It seems that the highest coefficients favor one class while the lowest favor the other, so you can simply do this:
import codecs, re, time
from itertools import chain
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

trainfile = 'train.txt'

# Vectorizing data: each line of train.txt becomes one document.
train = []
word_vectorizer = CountVectorizer(analyzer='word')
# Use a context manager so the file handle is closed after fit_transform
# (the original left it open).
with codecs.open(trainfile, 'r', 'utf8') as fin:
    trainset = word_vectorizer.fit_transform(fin)
tags = ['bs', 'pt', 'bs', 'pt']

# Training NB
mnb = MultinomialNB()
mnb.fit(trainset, tags)
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=10):
    """Print the n features most indicative of each class of a binary model.

    The lowest coefficients point to classes_[0], the highest to classes_[1].
    Returns (topn_class1, topn_class2) as lists of (coef, feature) tuples.
    """
    class_labels = classifier.classes_
    # Newer scikit-learn renamed get_feature_names -> get_feature_names_out.
    names_fn = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = names_fn()
    # Sort once instead of twice as the original did.
    ranked = sorted(zip(classifier.coef_[0], feature_names))
    topn_class1 = ranked[:n]
    topn_class2 = ranked[-n:]
    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
    print()
    # Highest coefficient first for the second class.
    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)
    return topn_class1, topn_class2
most_informative_feature_for_binary_classification(word_vectorizer, mnb)
[out]:
bs -5.5568280617 acuerdo
bs -5.5568280617 al
bs -5.5568280617 alex
bs -5.5568280617 algo
bs -5.5568280617 andaba
bs -5.5568280617 andrea
bs -5.5568280617 bien
bs -5.5568280617 buscando
bs -5.5568280617 como
bs -5.5568280617 con
pt -4.17053370058 que
pt -4.17053370058 as
pt -4.45821577303 či
pt -4.45821577303 me
pt -4.45821577303 mais
pt -4.45821577303 foi
pt -4.45821577303 de
pt -4.86368088114 švajčiarske
pt -4.86368088114 ďalšími
pt -4.86368088114 času
Actually if you've read @larsmans comment carefully, he gave the hint on the binary classes' coefficient in How to get most informative features for scikit-learn classifiers?
Problems obtaining most informative features with scikit learn?
To solve this specifically for linear SVM, we first have to understand the formulation of the SVM in sklearn and the differences that it has to MultinomialNB.
The reason why the most_informative_feature_for_class
works for MultinomialNB is because the output of the coef_
is essentially the log probability of features given a class (and hence is of size [n_classes, n_features]),
due to the formulation of the naive bayes problem. But if we check the documentation for SVM, the coef_
is not that simple. Instead coef_
for (linear) SVM is [n_classes * (n_classes -1)/2, n_features]
because each of the binary models are fitted to every possible class.
If we do possess some knowledge on which particular coefficient we're interested in, we could alter the function to look like the following:
def most_informative_feature_for_class_svm(vectorizer, classifier, classlabel, n=10):
    # ?? is a deliberate placeholder: for a one-vs-one SVC, coef_ holds one
    # row per class PAIR (n_classes * (n_classes - 1) / 2 rows), so the caller
    # must decide which pairwise coefficient row to inspect -- there is no
    # direct per-class row to look up from classlabel.
    labelid = ?? # this is the coef we're interested in.
    feature_names = vectorizer.get_feature_names()
    # Densify (.toarray) before row-indexing the coefficient matrix.
    svm_coef = classifier.coef_.toarray()
    # n features with the largest coefficients in that row, ascending.
    topn = sorted(zip(svm_coef[labelid], feature_names))[-n:]
    for coef, feat in topn:
        print feat, coef
This would work as intended and print out the labels and the top n features according to the coefficient vector that you're after.
As for getting the correct output for a particular class, that would depend on the assumptions and what you aim to output. I suggest reading through the multi-class documentation within the SVM documentation to get a feel for what you're after.
So using the train.txt
file which was described in this question, we can get some kind of output, though in this situation it isn't particularly descriptive or helpful to interpret. Hopefully this helps you.
import codecs, re, time
from itertools import chain
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC  # moved to the top with the other imports

trainfile = 'train.txt'

# Vectorizing data: each line of train.txt becomes one document.
train = []
word_vectorizer = CountVectorizer(analyzer='word')
# Use a context manager so the file handle is closed after fit_transform
# (the original left it open).
with codecs.open(trainfile, 'r', 'utf8') as fin:
    trainset = word_vectorizer.fit_transform(fin)
tags = ['bs', 'pt', 'es', 'sr']

# Training NB
mnb = MultinomialNB()
mnb.fit(trainset, tags)

# Training a linear SVM on the same data (one-vs-one multiclass).
svcc = SVC(kernel='linear', C=1)
svcc.fit(trainset, tags)
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    """Print the n highest-weighted features for *classlabel*.

    Works for classifiers whose ``coef_`` has one row per class (e.g.
    MultinomialNB). Returns the list of (coef, feature) tuples, ascending.
    """
    labelid = list(classifier.classes_).index(classlabel)
    # Newer scikit-learn renamed get_feature_names -> get_feature_names_out.
    names_fn = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = names_fn()
    topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]
    for coef, feat in topn:
        print(classlabel, feat, coef)
    return topn
def most_informative_feature_for_class_svm(vectorizer, classifier, n=10, labelid=3):
    """Print the n highest-weighted features of one SVM coefficient row.

    labelid -- index into ``coef_`` (default 3 preserves the original
               hard-coded choice); for a one-vs-one SVC each row belongs to a
               class *pair*, not a single class.

    Returns the list of (coef, feature) tuples, ascending.
    """
    # Newer scikit-learn renamed get_feature_names -> get_feature_names_out.
    names_fn = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = names_fn()
    # Densify the (sparse) coefficient matrix before row-indexing.
    svm_coef = classifier.coef_.toarray()
    topn = sorted(zip(svm_coef[labelid], feature_names))[-n:]
    for coef, feat in topn:
        print(feat, coef)
    return topn
# Top MultinomialNB features for the 'pt' class.
most_informative_feature_for_class(word_vectorizer, mnb, 'pt')
# Bare `print` emits a blank separator line (Python 2).
print
# Top features of the hard-coded SVM coefficient row (labelid = 3 inside).
most_informative_feature_for_class_svm(word_vectorizer, svcc)
with output:
pt teve -4.63472898823
pt tive -4.63472898823
pt todas -4.63472898823
pt vida -4.63472898823
pt de -4.22926388012
pt foi -4.22926388012
pt mais -4.22926388012
pt me -4.22926388012
pt as -3.94158180767
pt que -3.94158180767
no 0.0204081632653
parecer 0.0204081632653
pone 0.0204081632653
por 0.0204081632653
relación 0.0204081632653
una 0.0204081632653
visto 0.0204081632653
ya 0.0204081632653
es 0.0408163265306
lo 0.0408163265306
Determining the most contributing features for SVM classifier in sklearn
Yes, there is attribute coef_
for SVM classifier but it only works for SVM with linear kernel. For other kernels it is not possible because data are transformed by kernel method to another space, which is not related to input space, check the explanation.
from matplotlib import pyplot as plt
from sklearn import svm
def f_importances(coef, names):
    """Plot linear-SVM feature importances as a horizontal bar chart.

    coef  -- a 1-D coefficient vector, or a 2-D ``coef_`` matrix, in which
             case the first row is used so values pair one-to-one with names.
    names -- feature names, same length as one coefficient row.
    """
    # svm.coef_ is 2-D; zipping it directly pairs whole rows with names,
    # so flatten to the first row when a nested sequence is passed.
    imp = coef[0] if len(coef) and hasattr(coef[0], "__len__") else coef
    # Sort importances and names together, ascending by importance.
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()
features_names = ['input1', 'input2']
# NOTE(review): rebinding `svm` here shadows the imported sklearn `svm` module.
svm = svm.SVC(kernel='linear')
# X and Y are assumed to be predefined training data -- not shown in this snippet.
svm.fit(X, Y)
f_importances(svm.coef_, features_names)
And the output of the function looks like this:
Related Topics
How to Scrape a Website Which Requires Login Using Python and Beautifulsoup
Nested for Loops Using List Comprehension
Python Global Exception Handling
Link Atlas/Mkl to an Installed Numpy
Bin Elements Per Row - Vectorized 2D Bincount for Numpy
Pass a Parameter to a Fixture Function
Compiling with Cython and Mingw Produces Gcc: Error: Unrecognized Command Line Option '-Mno-Cygwin'
What Is the Maximum Float in Python
How to Run an Ipython Magic from a Script (Or Timing a Python Script)
Extrapolate Values in Pandas Dataframe
Does Python Support Multiprocessor/Multicore Programming
Group by & Count Function in SQLalchemy
Why Does Defining _Getitem_ on a Class Make It Iterable in Python
How to Remove the Left Part of a String
Matplotlib and Ipython-Notebook: Displaying Exactly the Figure That Will Be Saved