# Adapted from http://blog.csdn.net/myhaspl
def wordfeatures(word):
    """Return the feature dict for a single word.

    The only feature is the word itself, stored under the key "cnword".
    """
    return {"cnword": word}


# ..... (code omitted in the original listing; presumably it builds the
# `samplewords` training set and the `testwords` test set used below --
# TODO confirm against the full source)

classifier = nltk.NaiveBayesClassifier.train(samplewords)

# Category predicted for the word "大学" (university).
print(u"----大学所属的类别-----")
print(classifier.classify({"cnword": u"大学"}))

# Category predicted for the word "大脑" (brain).
print(u"----大脑所属的类别-----")
print(classifier.classify({"cnword": u"大脑"}))

# Classification accuracy on the test data.
print(nltk.classify.accuracy(classifier, testwords))

# The 10 feature values (words) that are most informative for
# classification, printed on one line separated by single spaces.
# NOTE(review): joining assumes every feature value is a string, which the
# "fname=fval" concatenation further down already relies on.
print(" ".join(
    mostword for wf, mostword in classifier.most_informative_features(10)))

# classifier.show_most_informative_features(10) could be called directly,
# but its UTF-8 output is displayed incorrectly, so its logic is reproduced
# here with the printing adjusted.
cpdist = classifier._feature_probdist
print('Most Informative Features')
for (fname, fval) in classifier.most_informative_features(10):

    def labelprob(label):
        """Return the probability of the current feature value under `label`."""
        return cpdist[label, fname].prob(fval)

    # Labels under which this feature value occurs, ordered from the
    # lowest to the highest probability.
    labels = sorted(
        [label for label in classifier._labels
         if fval in cpdist[label, fname].samples()],
        key=labelprob)
    # With a single label there is no second label to form a ratio with.
    if len(labels) == 1:
        continue
    l0 = labels[0]
    l1 = labels[-1]
    # Look up the lowest probability once; it is both the zero check and
    # the divisor.
    lowprob = labelprob(l0)
    if lowprob == 0:
        ratio = 'INF'
    else:
        ratio = '%8.1f' % (labelprob(l1) / lowprob)
    # One print per feature: "fname=fval", a single space, then the
    # "high : low = ratio : 1.0" summary with labels truncated to 6 chars.
    # NOTE(review): the concatenation assumes labels and feature values are
    # the same string type -- confirm against the omitted data-loading code.
    print(fname + "=" + fval + " " +
          ('%6s : %-6s = %s : 1.0' % (("%s" % l1)[:6], ("%s" % l0)[:6], ratio)))

# Run output recorded in the original post (whitespace as it appears there):
#
# ----大学所属的类别-----
# 教育
# ----大脑所属的类别-----
# 科技
# 0.977346278317
# 世界 公司 事先 游戏 之后 领域 采用 学科 里面 技术
# Most Informative Features
# cnword=世界 科技 : 教育 = 20.6 : 1.0
# cnword=公司 科技 : 教育 = 12.4 : 1.0
# cnword=事先 科技 : 教育 = 5.8 : 1.0
# cnword=游戏 科技 : 教育 = 5.8 : 1.0
# cnword=之后 科技 : 教育 = 4.5 : 1.0
# cnword=领域 科技 : 教育 = 4.5 : 1.0
# cnword=采用 科技 : 教育 = 4.5 : 1.0
# cnword=学科 科技 : 教育 = 4.1 : 1.0
# cnword=里面 科技 : 教育 = 4.1 : 1.0
# cnword=技术 科技 : 教育 = 4.1 : 1.0
以上示例使用朴素贝叶斯分类器对词条进行分类,分类结果如上所示。