# -*- coding: utf-8 -*-
# code by myhaspl (http://blog.csdn.net/myhaspl)
from __future__ import unicode_literals
from __future__ import division
import sys
sys.path.append("../")

import nltk
import jieba


def cutstring(txt):
    # Segment the Chinese text with jieba and join the tokens with spaces
    cutstr = jieba.cut(txt)
    result = " ".join(cutstr)
    return result


# Read the input file
txtfileobject = open('test2.txt', 'r')
try:
    filestr = txtfileobject.read()
finally:
    txtfileobject.close()

cutstr = cutstring(filestr)
tokenstr = nltk.word_tokenize(cutstr)
fdist = nltk.FreqDist(tokenstr)

# Frequency of each word length (word lengths as the elements)
print "----word-length frequency-----"
fdist1 = nltk.FreqDist([len(w) for w in tokenstr])
for w, c in fdist1.items():
    print w, "=>", c, "||",

# Distinct word lengths
print
print "----word lengths-----"
print fdist1.keys()

# Frequency of each word
print
print "---word frequency---"
fdist2 = nltk.FreqDist(tokenstr)
for w, c in fdist2.items():
    print w, "=>", c, "||",

Running the script on test2.txt produces the following output:
----word-length frequency-----
1 => 750 || 2 => 864 || 3 => 80 || 4 => 28 || 5 => 2 || 6 => 1 ||
----word lengths-----
[1, 2, 3, 4, 5, 6]
---word frequency---
要 => 2 || 大脑皮层 => 2 || 一切 => 3 || 无意识 => 1 || 加快 => 1 || 一方面 => 1 || 通过 => 2 || 特性 => 1 || 电视观众 => 1 || 窗 => 1 || 圣哲 => 1 || 会 => 16 || 神经科学 => 1 || 被 => 3 ||
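The fdist2.items() loop above prints the word counts in arbitrary order. On a newer setup (Python 3 with NLTK 3, where FreqDist subclasses collections.Counter), the same counts can be listed from most to least frequent with most_common; a minimal sketch, assuming the same test2.txt input and jieba segmentation:

# -*- coding: utf-8 -*-
# Python 3 / NLTK 3 variant (assumed environment): print the ten most
# frequent words in descending order instead of dumping fdist2.items().
import nltk
import jieba

with open('test2.txt', encoding='utf-8') as f:   # same input file as above
    text = f.read()

tokens = jieba.lcut(text)                  # jieba.lcut returns a list of tokens
fdist = nltk.FreqDist(tokens)              # FreqDist is a Counter subclass

for word, count in fdist.most_common(10):  # top 10 words by frequency
    print(word, "=>", count)

In the output above, for instance, 会 (16 occurrences) would then appear near the top of the list.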