How to Get Most Informative Features for Scikit-Learn Classifiers

How to get most informative features for scikit-learn classifiers?

With the help of larsmans' code, I came up with this code for the binary case:

def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2)

How to get the most informative features for a scikit-learn classifier for different classes?

In the case of binary classification, the coefficient array seems to have been flattened: coef_ holds a single row instead of one row per class.
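
A quick way to see this (a sketch, assuming a MultinomialNB fitted on two classes, like the mnb below):

print mnb.feature_log_prob_.shape  # (2, n_features): one row per class
print mnb.coef_.shape              # (1, n_features): collapsed in the binary case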

Let's try to relabel our data with only two labels:

import codecs, re, time
from itertools import chain

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

trainfile = 'train.txt'

# Vectorizing data.
train = []
word_vectorizer = CountVectorizer(analyzer='word')
trainset = word_vectorizer.fit_transform(codecs.open(trainfile,'r','utf8'))
tags = ['bs','pt','bs','pt']

# Training NB
mnb = MultinomialNB()
mnb.fit(trainset, tags)

print mnb.classes_
print mnb.coef_[0]
print mnb.coef_[1]

[out]:

['bs' 'pt']
[-5.55682806 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -5.55682806
-4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088 -4.86368088
-4.1705337 -5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -5.55682806 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.45821577 -4.86368088
-4.86368088 -4.45821577 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -4.1705337 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -4.45821577 -4.86368088 -4.86368088]
Traceback (most recent call last):
  File "test.py", line 24, in <module>
    print mnb.coef_[1]
IndexError: index 1 is out of bounds for axis 0 with size 1

So let's do some diagnostics:

print mnb.feature_count_
print mnb.coef_[0]

[out]:

[[ 1.  0.  0.  1.  1.  1.  0.  0.  1.  1.  0.  0.  0.  1.  0.  1.  0.  1.
   1.  1.  2.  2.  0.  0.  0.  1.  1.  0.  1.  0.  0.  0.  0.  0.  2.  1.
   1.  1.  1.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.  0.
   0.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  1.  1.  0.  1.  0.
   1.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  1.  1.  0.  1.  1.
   0.  1.  0.  0.  0.  1.  1.  1.  0.  0.  1.  0.  1.  0.  1.  0.  1.  1.
   1.  0.  0.  1.  0.  0.  0.  4.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.
   0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.  0.  0.  0.
   0.  0.  1.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  1.  1.  0.  0.  0.  1.  1.  0.  0.  1.  1.  3.  0.  1.  0.  1.  0.
   0.  0.  1.  2.  1.  1.  1.  1.  0.  1.  0.  1.  1.  1.  1.  1.  0.  0.
   0.  0.  0.  2.  1.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  0.  1.  1.
   1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  0.  0.  1.  0.  1.
   0.  0.  1.  1.  2.  1.  1.  2.  1.  1.  1.  0.  1.  0.  0.  1.  0.  0.
   1.  0.  1.  1.  1.  0.  0.  0.  1.  1.  0.  1.  0.  1.  0.  1.  0.  0.
   0.  1.  1.  0.  1.  1.  1.  3.  1.  1.  0.  1.  1.  1.  1.  1.  0.  1.
   1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.  0.  0.  1.  1.  1.  1.
   1.  1.  0.  1.  1.  0.  1.  2.  1.  1.]]
[-5.55682806 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -5.55682806
-4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088 -4.86368088
-4.1705337 -5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -5.55682806 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.45821577 -4.86368088
-4.86368088 -4.45821577 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -4.1705337 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -4.45821577 -4.86368088 -4.86368088]

So the per-class counts are preserved in feature_count_, but for a binary problem coef_ apparently collapses to a single row of log probabilities. Let's line the coefficients up with the per-class counts:

index = 0
coef_features_c1_c2 = []

for feat, c1, c2 in zip(word_vectorizer.get_feature_names(), mnb.feature_count_[0], mnb.feature_count_[1]):
    coef_features_c1_c2.append(tuple([mnb.coef_[0][index], feat, c1, c2]))
    index += 1

for i in sorted(coef_features_c1_c2):
    print i

[out]:

(-5.5568280616995374, u'acuerdo', 1.0, 0.0)
(-5.5568280616995374, u'al', 1.0, 0.0)
(-5.5568280616995374, u'alex', 1.0, 0.0)
(-5.5568280616995374, u'algo', 1.0, 0.0)
(-5.5568280616995374, u'andaba', 1.0, 0.0)
(-5.5568280616995374, u'andrea', 1.0, 0.0)
(-5.5568280616995374, u'bien', 1.0, 0.0)
(-5.5568280616995374, u'buscando', 1.0, 0.0)
(-5.5568280616995374, u'como', 1.0, 0.0)
(-5.5568280616995374, u'con', 1.0, 0.0)
(-5.5568280616995374, u'conseguido', 1.0, 0.0)
(-5.5568280616995374, u'distancia', 1.0, 0.0)
(-5.5568280616995374, u'doprinese', 1.0, 0.0)
(-5.5568280616995374, u'es', 2.0, 0.0)
(-5.5568280616995374, u'est\xe1', 1.0, 0.0)
(-5.5568280616995374, u'eulex', 1.0, 0.0)
(-5.5568280616995374, u'excusa', 1.0, 0.0)
(-5.5568280616995374, u'fama', 1.0, 0.0)
(-5.5568280616995374, u'guasch', 1.0, 0.0)
(-5.5568280616995374, u'ha', 1.0, 0.0)
(-5.5568280616995374, u'incident', 1.0, 0.0)
(-5.5568280616995374, u'ispit', 1.0, 0.0)
(-5.5568280616995374, u'istragu', 1.0, 0.0)
(-5.5568280616995374, u'izbijanju', 1.0, 0.0)
(-5.5568280616995374, u'ja\u010danju', 1.0, 0.0)
(-5.5568280616995374, u'je', 1.0, 0.0)
(-5.5568280616995374, u'jedan', 1.0, 0.0)
(-5.5568280616995374, u'jo\u0161', 1.0, 0.0)
(-5.5568280616995374, u'kapaciteta', 1.0, 0.0)
(-5.5568280616995374, u'kosova', 1.0, 0.0)
(-5.5568280616995374, u'la', 1.0, 0.0)
(-5.5568280616995374, u'lequio', 1.0, 0.0)
(-5.5568280616995374, u'llevar', 1.0, 0.0)
(-5.5568280616995374, u'lo', 2.0, 0.0)
(-5.5568280616995374, u'misije', 1.0, 0.0)
(-5.5568280616995374, u'muy', 1.0, 0.0)
(-5.5568280616995374, u'm\xe1s', 1.0, 0.0)
(-5.5568280616995374, u'na', 1.0, 0.0)
(-5.5568280616995374, u'nada', 1.0, 0.0)
(-5.5568280616995374, u'nasilja', 1.0, 0.0)
(-5.5568280616995374, u'no', 1.0, 0.0)
(-5.5568280616995374, u'obaviti', 1.0, 0.0)
(-5.5568280616995374, u'obe\u0107ao', 1.0, 0.0)
(-5.5568280616995374, u'parecer', 1.0, 0.0)
(-5.5568280616995374, u'pone', 1.0, 0.0)
(-5.5568280616995374, u'por', 1.0, 0.0)
(-5.5568280616995374, u'po\u0161to', 1.0, 0.0)
(-5.5568280616995374, u'prava', 1.0, 0.0)
(-5.5568280616995374, u'predstavlja', 1.0, 0.0)
(-5.5568280616995374, u'pro\u0161losedmi\u010dnom', 1.0, 0.0)
(-5.5568280616995374, u'relaci\xf3n', 1.0, 0.0)
(-5.5568280616995374, u'sjeveru', 1.0, 0.0)
(-5.5568280616995374, u'taj', 1.0, 0.0)
(-5.5568280616995374, u'una', 1.0, 0.0)
(-5.5568280616995374, u'visto', 1.0, 0.0)
(-5.5568280616995374, u'vladavine', 1.0, 0.0)
(-5.5568280616995374, u'ya', 1.0, 0.0)
(-5.5568280616995374, u'\u0107e', 1.0, 0.0)
(-4.863680881139592, u'aj', 0.0, 1.0)
(-4.863680881139592, u'ajudou', 0.0, 1.0)
(-4.863680881139592, u'alpsk\xfdmi', 0.0, 1.0)
(-4.863680881139592, u'alpy', 0.0, 1.0)
(-4.863680881139592, u'ao', 0.0, 1.0)
(-4.863680881139592, u'apresenta', 0.0, 1.0)
(-4.863680881139592, u'bl\xedzko', 0.0, 1.0)
(-4.863680881139592, u'come\xe7o', 0.0, 1.0)
(-4.863680881139592, u'da', 2.0, 1.0)
(-4.863680881139592, u'decepcionantes', 0.0, 1.0)
(-4.863680881139592, u'deti', 0.0, 1.0)
(-4.863680881139592, u'dificuldades', 0.0, 1.0)
(-4.863680881139592, u'dif\xedcil', 1.0, 1.0)
(-4.863680881139592, u'do', 0.0, 1.0)
(-4.863680881139592, u'druh', 0.0, 1.0)
(-4.863680881139592, u'd\xe1', 0.0, 1.0)
(-4.863680881139592, u'ela', 0.0, 1.0)
(-4.863680881139592, u'encontrar', 0.0, 1.0)
(-4.863680881139592, u'enfrentar', 0.0, 1.0)
(-4.863680881139592, u'for\xe7as', 0.0, 1.0)
(-4.863680881139592, u'furiosa', 0.0, 1.0)
(-4.863680881139592, u'golf', 0.0, 1.0)
(-4.863680881139592, u'golfistami', 0.0, 1.0)
(-4.863680881139592, u'golfov\xfdch', 0.0, 1.0)
(-4.863680881139592, u'hotelmi', 0.0, 1.0)
(-4.863680881139592, u'hra\u0165', 0.0, 1.0)
(-4.863680881139592, u'ide', 0.0, 1.0)
(-4.863680881139592, u'ihr\xedsk', 0.0, 1.0)
(-4.863680881139592, u'intranspon\xedveis', 0.0, 1.0)
(-4.863680881139592, u'in\xedcio', 0.0, 1.0)
(-4.863680881139592, u'in\xfd', 0.0, 1.0)
(-4.863680881139592, u'kde', 0.0, 1.0)
(-4.863680881139592, u'kombin\xe1cie', 0.0, 1.0)
(-4.863680881139592, u'komplex', 0.0, 1.0)
(-4.863680881139592, u'kon\u010diarmi', 0.0, 1.0)
(-4.863680881139592, u'lado', 0.0, 1.0)
(-4.863680881139592, u'lete', 0.0, 1.0)
(-4.863680881139592, u'longo', 0.0, 1.0)
(-4.863680881139592, u'ly\u017eova\u0165', 0.0, 1.0)
(-4.863680881139592, u'man\u017eelky', 0.0, 1.0)
(-4.863680881139592, u'mas', 0.0, 1.0)
(-4.863680881139592, u'mesmo', 0.0, 1.0)
(-4.863680881139592, u'meu', 0.0, 1.0)
(-4.863680881139592, u'minha', 0.0, 1.0)
(-4.863680881139592, u'mo\u017enos\u0165ami', 0.0, 1.0)
(-4.863680881139592, u'm\xe3e', 0.0, 1.0)
(-4.863680881139592, u'nad\u0161en\xfdmi', 0.0, 1.0)
(-4.863680881139592, u'negativas', 0.0, 1.0)
(-4.863680881139592, u'nie', 0.0, 1.0)
(-4.863680881139592, u'nieko\u013ek\xfdch', 0.0, 1.0)
(-4.863680881139592, u'para', 0.0, 1.0)
(-4.863680881139592, u'parecem', 0.0, 1.0)
(-4.863680881139592, u'pod', 0.0, 1.0)
(-4.863680881139592, u'pon\xfakaj\xfa', 0.0, 1.0)
(-4.863680881139592, u'potrebuj\xfa', 0.0, 1.0)
(-4.863680881139592, u'pri', 0.0, 1.0)
(-4.863680881139592, u'prova\xe7\xf5es', 0.0, 1.0)
(-4.863680881139592, u'punham', 0.0, 1.0)
(-4.863680881139592, u'qual', 0.0, 1.0)
(-4.863680881139592, u'qualquer', 0.0, 1.0)
(-4.863680881139592, u'quem', 0.0, 1.0)
(-4.863680881139592, u'rak\xfaske', 0.0, 1.0)
(-4.863680881139592, u'rezortov', 0.0, 1.0)
(-4.863680881139592, u'sa', 0.0, 1.0)
(-4.863680881139592, u'sebe', 0.0, 1.0)
(-4.863680881139592, u'sempre', 0.0, 1.0)
(-4.863680881139592, u'situa\xe7\xf5es', 0.0, 1.0)
(-4.863680881139592, u'spojen\xfdch', 0.0, 1.0)
(-4.863680881139592, u'suplantar', 0.0, 1.0)
(-4.863680881139592, u's\xfa', 0.0, 1.0)
(-4.863680881139592, u'tak', 0.0, 1.0)
(-4.863680881139592, u'talianske', 0.0, 1.0)
(-4.863680881139592, u'teve', 0.0, 1.0)
(-4.863680881139592, u'tive', 0.0, 1.0)
(-4.863680881139592, u'todas', 0.0, 1.0)
(-4.863680881139592, u'tr\xe1venia', 0.0, 1.0)
(-4.863680881139592, u've\u013ek\xfd', 0.0, 1.0)
(-4.863680881139592, u'vida', 0.0, 1.0)
(-4.863680881139592, u'vo', 0.0, 1.0)
(-4.863680881139592, u'vo\u013en\xe9ho', 0.0, 1.0)
(-4.863680881139592, u'vysok\xfdmi', 0.0, 1.0)
(-4.863680881139592, u'vy\u017eitia', 0.0, 1.0)
(-4.863680881139592, u'v\xe4\u010d\u0161ine', 0.0, 1.0)
(-4.863680881139592, u'v\u017edy', 0.0, 1.0)
(-4.863680881139592, u'zauj\xedmav\xe9', 0.0, 1.0)
(-4.863680881139592, u'zime', 0.0, 1.0)
(-4.863680881139592, u'\u010dasu', 0.0, 1.0)
(-4.863680881139592, u'\u010fal\u0161\xedmi', 0.0, 1.0)
(-4.863680881139592, u'\u0161vaj\u010diarske', 0.0, 1.0)
(-4.4582157730314274, u'de', 2.0, 2.0)
(-4.4582157730314274, u'foi', 0.0, 2.0)
(-4.4582157730314274, u'mais', 0.0, 2.0)
(-4.4582157730314274, u'me', 0.0, 2.0)
(-4.4582157730314274, u'\u010di', 0.0, 2.0)
(-4.1705337005796466, u'as', 0.0, 3.0)
(-4.1705337005796466, u'que', 4.0, 3.0)

Now we see a pattern: the high end of the coefficients favors one class and the low end favors the other, so you can simply do this:

import codecs, re, time
from itertools import chain

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

trainfile = 'train.txt'

# Vectorizing data.
train = []
word_vectorizer = CountVectorizer(analyzer='word')
trainset = word_vectorizer.fit_transform(codecs.open(trainfile,'r','utf8'))
tags = ['bs','pt','bs','pt']

# Training NB
mnb = MultinomialNB()
mnb.fit(trainset, tags)

def most_informative_feature_for_binary_classification(vectorizer, classifier, n=10):
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

    for coef, feat in topn_class1:
        print class_labels[0], coef, feat

    print

    for coef, feat in reversed(topn_class2):
        print class_labels[1], coef, feat

most_informative_feature_for_binary_classification(word_vectorizer, mnb)

[out]:

bs -5.5568280617 acuerdo
bs -5.5568280617 al
bs -5.5568280617 alex
bs -5.5568280617 algo
bs -5.5568280617 andaba
bs -5.5568280617 andrea
bs -5.5568280617 bien
bs -5.5568280617 buscando
bs -5.5568280617 como
bs -5.5568280617 con

pt -4.17053370058 que
pt -4.17053370058 as
pt -4.45821577303 či
pt -4.45821577303 me
pt -4.45821577303 mais
pt -4.45821577303 foi
pt -4.45821577303 de
pt -4.86368088114 švajčiarske
pt -4.86368088114 ďalšími
pt -4.86368088114 času

Actually, if you read @larsmans' comment carefully, he already gave a hint about the binary classes' coefficient in How to get most informative features for scikit-learn classifiers?
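
Following that hint, a minimal sketch that sidesteps the collapsed coef_ for a binary MultinomialNB by reading feature_log_prob_ directly, which keeps one row per class (the function name is my own):

def most_informative_feature_for_class_nb(vectorizer, classifier, classlabel, n=10):
    # feature_log_prob_ has shape (n_classes, n_features), even for binary problems.
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    topn = sorted(zip(classifier.feature_log_prob_[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print classlabel, feat, coef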

Problems obtaining most informative features with scikit learn?

To solve this specifically for a linear SVM, we first have to understand the formulation of the SVM in sklearn and how it differs from MultinomialNB.

The reason most_informative_feature_for_class works for MultinomialNB is that the output of coef_ is essentially the log probability of features given a class (and hence has size [n_classes, n_features], due to the formulation of the naive Bayes problem). But if we check the documentation for SVM, coef_ is not that simple. Instead, coef_ for a (linear) SVM has shape [n_classes * (n_classes - 1) / 2, n_features], because a separate binary model is fitted for every possible pair of classes (one-vs-one).
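
A short sketch of that layout (an assumption worth checking against your sklearn version: the one-vs-one rows should come out in itertools.combinations order, i.e. class 0 vs 1, 0 vs 2, ..., 1 vs 2, ...):

from itertools import combinations

# Map each row of the one-vs-one coef_ matrix to its pair of classes.
# svcc is the fitted linear SVC from the full example further down.
for row, (c1, c2) in enumerate(combinations(svcc.classes_, 2)):
    print row, c1, 'vs', c2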

If we do possess some knowledge on which particular coefficient we're interested in, we could alter the function to look like the following:

def most_informative_feature_for_class_svm(vectorizer, classifier, classlabel, n=10):
    labelid = ??  # this is the coef we're interested in.
    feature_names = vectorizer.get_feature_names()
    svm_coef = classifier.coef_.toarray()
    topn = sorted(zip(svm_coef[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print feat, coef

This would work as intended and print out the labels and the top n features according to the coefficient vector that you're after.

As for getting the correct output for a particular class, that depends on your assumptions and what you aim to output. I suggest reading through the multi-class section of the SVM documentation to get a feel for what you're after.

So, using the train.txt file described in this question, we can get some kind of output, though in this situation it isn't particularly descriptive or easy to interpret. Hopefully this helps you.

import codecs, re, time
from itertools import chain

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

trainfile = 'train.txt'

# Vectorizing data.
train = []
word_vectorizer = CountVectorizer(analyzer='word')
trainset = word_vectorizer.fit_transform(codecs.open(trainfile,'r','utf8'))
tags = ['bs','pt','es','sr']

# Training NB
mnb = MultinomialNB()
mnb.fit(trainset, tags)

from sklearn.svm import SVC
svcc = SVC(kernel='linear', C=1)
svcc.fit(trainset, tags)

def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print classlabel, feat, coef

def most_informative_feature_for_class_svm(vectorizer, classifier, n=10):
    labelid = 3  # this is the coef we're interested in.
    feature_names = vectorizer.get_feature_names()
    svm_coef = classifier.coef_.toarray()
    topn = sorted(zip(svm_coef[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print feat, coef

most_informative_feature_for_class(word_vectorizer, mnb, 'pt')
print
most_informative_feature_for_class_svm(word_vectorizer, svcc)

with output:

pt teve -4.63472898823
pt tive -4.63472898823
pt todas -4.63472898823
pt vida -4.63472898823
pt de -4.22926388012
pt foi -4.22926388012
pt mais -4.22926388012
pt me -4.22926388012
pt as -3.94158180767
pt que -3.94158180767

no 0.0204081632653
parecer 0.0204081632653
pone 0.0204081632653
por 0.0204081632653
relación 0.0204081632653
una 0.0204081632653
visto 0.0204081632653
ya 0.0204081632653
es 0.0408163265306
lo 0.0408163265306

Determining the most contributing features for SVM classifier in sklearn

Yes, there is a coef_ attribute for the SVM classifier, but it only works for SVMs with a linear kernel. For other kernels it is not possible, because the data are transformed by the kernel method into another space that is not related to the input space; check the explanation.
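
A quick illustration (a sketch; X and Y stand in for your training data, and in the sklearn versions I know accessing coef_ on a non-linear SVC raises an AttributeError):

from sklearn import svm

clf = svm.SVC(kernel='rbf')
clf.fit(X, Y)
print clf.coef_  # AttributeError: coef_ is only available when using a linear kernel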

from matplotlib import pyplot as plt
from sklearn import svm

def f_importances(coef, names):
    # Sort the features by coefficient value and plot a horizontal bar chart.
    imp = coef
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()

features_names = ['input1', 'input2']
svm = svm.SVC(kernel='linear')
svm.fit(X, Y)  # X, Y: your training data
f_importances(svm.coef_[0], features_names)  # coef_ is 2D; take the first row

And the output of the function looks like this:

[Figure: horizontal bar chart of the feature importances]
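
For text models with thousands of features, the same idea is more readable if you keep only the largest coefficients (a hypothetical variant of f_importances; top_n and the slicing are my additions, and it reuses the pyplot import from the snippet above):

import numpy as np

def f_importances_top(coef, names, top_n=10):
    # Keep only the top_n coefficients with the largest absolute value.
    coef = np.asarray(coef)
    idx = np.argsort(np.abs(coef))[-top_n:]
    imp, names = zip(*sorted(zip(coef[idx], [names[i] for i in idx])))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()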


