# Excerpt (Python): pull the next three whitespace-separated fields off the
# current line via _next_token().
# get the lemma and part-of-speech
lemma = _next_token()
pos = _next_token()
# get the number of synsets for this lemma
n_synsets = int(_next_token())
...
let token = util.getIterator(line.split(/\s+/).filter(v=>v!=='')); // or let token = line.split(/\s+/).filter(v=>v!=='')[Symbol.iterator](); try { // lemma and pos lemma = token.next().value; pos = token.next().value; // get how many synsets for this lemma nSynsets = parseInt(token.next().value); assert(nSynsets > 0); ...
/**
 * Turn any iterable into a generator object, so callers can pull values one
 * at a time with `.next()` instead of indexing.
 *
 * @param {Iterable<*>} array - the values to hand out, in iteration order.
 * @returns {Generator<*, void, *>} generator yielding each value of `array`.
 */
util.getIterator = function* (array) {
  for (const element of array) {
    yield element;
  }
};
nltk中生成ngram的方法的js实现
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/**
 * Lazily yield every run of `n` consecutive items of `sequence`
 * (a sliding window of width `n`, advanced one item at a time).
 *
 * Nothing is yielded when `sequence` has fewer than `n` items.
 *
 * @param {Iterable<*>} sequence - source items; consumed once, front to back.
 * @param {number} n - window width.
 * @returns {Generator<Array<*>, void, *>} one fresh array per n-gram.
 */
function* ngrams(sequence, n) {
  // Arrow functions cannot be generators, hence the function* expression.
  // Wrapping guarantees an iterator that is itself iterable and that keeps
  // its position between the priming loop and the for...of below.
  const items = (function* () {
    yield* sequence;
  })();

  // Prime the window with the first n - 1 items.
  const window = [];
  for (let primed = 1; primed < n; primed++) {
    const step = items.next();
    if (step.done) {
      return; // sequence too short to form a single n-gram
    }
    window.push(step.value);
  }

  for (const item of items) {
    window.push(item);
    // Yield a copy: `window` is mutated on the next line, so handing out the
    // array itself would make every yielded n-gram alias the same object.
    yield [...window];
    window.shift();
  }
}
/**
 * List the ways of cutting the string `s` into space-separated pieces.
 *
 * The string is split at every position into a left and a right part, both
 * parts are expanded recursively, and every left/right combination is joined
 * with a single space. A segmentation with three or more pieces can be reached
 * through more than one top-level cut, so the returned array may contain the
 * same string several times (e.g. "a b c" appears twice for "abc").
 *
 * @param {string} s - text to segment.
 * @returns {string[]} segmentations in generation order; empty for "".
 */
function splitSet(s) {
  function* _splitAll(s) {
    if (s.length === 0) {
      return;
    }
    // The uncut string is itself a segmentation.
    yield s;
    for (let i = 1; i < s.length; i++) {
      const left = s.slice(0, i);
      const right = s.slice(i);
      for (const wl of _splitAll(left)) {
        for (const wr of _splitAll(right)) {
          yield `${wl} ${wr}`;
        }
      }
    }
  }

  return [..._splitAll(s)];
}
想法很好:将序列分为左右两部分,递归地简化处理。结果却发现同一种分割出现了多次。
1
> splitSet("abc")
[ 'abc', 'a bc', 'a b c', 'ab c', 'a b c' ]
相同的结果需要排除,这时候ES6中的Set就很方便了…
1 2 3 4 5
let set = newSet(); for (let w of _splitAll(s)){ set.add(w); } return set;
# Tokenize a sentence with a verbose (?x) regular expression: in verbose mode
# whitespace in the pattern is ignored and "#" starts a comment that runs to
# the end of the line, so each alternative must sit on its own line.
text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)    # set flag to allow verbose regexps
     ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
   | \w+(-\w+)*        # words with optional internal hyphens
   | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
   | \.\.\.            # ellipsis
   | [][.,;"'?():-_`]  # these are separate tokens;
'''
nltk.regexp_tokenize(text, pattern)
# result shown for the call above:
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']
想了想如何在js中自己实现,忽然想到了template string这个特性。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/**
 * Template tag for writing a regular-expression source in a verbose,
 * commented layout (similar in spirit to Python's r'''(?x) ...''').
 *
 * The RAW template text is used, so backslash sequences such as \w or \.
 * reach the result unchanged instead of being consumed as string escapes.
 * Interpolated values are spliced in, then everything from "//" to the end
 * of its line is dropped, and finally all whitespace is removed.
 *
 * Limitations: a literal "//" or literal whitespace cannot be expressed in
 * the pattern text, because both are stripped unconditionally.
 *
 * @param {TemplateStringsArray|string[]} strings - template segments; a plain
 *   array of strings is also accepted when called as an ordinary function.
 * @param {...*} values - interpolated values, converted to strings.
 * @returns {string} the compacted pattern source.
 */
function r(strings, ...values) {
  // `strings[0]` would be the cooked text, in which "\w" has already become
  // "w"; `.raw` preserves the backslashes. Fall back to the array itself for
  // non-template calls, which carry no `.raw` property.
  const rawParts = strings.raw || strings;
  const source = String.raw({ raw: rawParts }, ...values);

  // `$` with the m flag also matches at the very end, so a comment on the
  // last line is removed even without a trailing newline.
  const withoutComments = source.replace(/\/\/.*$/gm, "");
  return withoutComments.replace(/\s+/g, "");
}
// Verbose tokenizer pattern passed through the `r` tag. Each alternative sits
// on its own line because a "//" comment runs to the end of its line.
// NOTE(review): in a JavaScript RegExp "[]" is an empty character class
// (unlike Python, where a leading "]" is a literal), so the last alternative
// may need to be written as [\][.,;"'?():-_\`] - confirm before compiling it.
r`
    ([A-Z]\.)+        // abbreviations, e.g. U.S.A.
  | \w+(-\w+)*        // words with optional internal hyphens
  | \$?\d+(\.\d+)?%?  // currency and percentages, e.g. $12.40, 82%
  | \.\.\.            // ellipsis
  | [][.,;"'?():-_\`] // these are separate tokens; includes ], [
`;
遗憾的是,javascript中没有这种东西。忽然想到了[ES6 in Depth: Proxies](https://hacks.mozilla.org/2015/07/es6-in-depth-proxies-and-reflect/)中的例子,想到可以依靠这个特性自己实现一个`defaultdict`。
```javascript function Defaultdict() { handler = { get: function (target, prop, receiver) { if (!(prop in target)) { target[prop] = {}; } return target[prop]; } } var p = new Proxy(this, handler); return p; }
// Demo: nested assignment works without creating the first-level keys first.
const expandedDct = new Defaultdict();
expandedDct.a.b = 1;
expandedDct.c.d = 2;
console.log(expandedDct.a.b); // 1
console.log(Object.keys(expandedDct)); // [ 'a', 'c' ]