Source: nltk.org/book/ch06.html
The gender_features() function builds a feature dictionary mapping the feature name 'last_letter' to word[-1], the final character of the name:

>>> def gender_features(word):
...     return {'last_letter': word[-1]}
>>> gender_features('Shrek')
{'last_letter': 'k'}
Next, load the names corpus from nltk.corpus.names (fetch it with nltk.download() if necessary) and build labeled_names, a list of (name, gender) pairs running from (u'Aaron', 'male') to (u'Zoe', 'female'). Shuffle it with random.shuffle() so the train/test split is not alphabetical:

>>> from nltk.corpus import names
>>> labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
...                  [(name, 'female') for name in names.words('female.txt')])
>>> import random
>>> random.shuffle(labeled_names)
Build featuresets, a list of pairs like ({'last_letter': 'n'}, 'male') and ({'last_letter': 'e'}, 'female'), hold out 500 examples for testing, and train a classifier with nltk.NaiveBayesClassifier:

>>> featuresets = [(gender_features(n), gender)
...                for (n, gender) in labeled_names]
>>> train_set, test_set = featuresets[500:], featuresets[:500]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
Use classify() to label new names, and nltk.classify.accuracy() to evaluate on the held-out test set:

>>> classifier.classify(gender_features('Neo'))
'male'
>>> classifier.classify(gender_features('Trinity'))
'female'
>>> print(nltk.classify.accuracy(classifier, test_set))
0.77
The .show_most_informative_features() method lists the features with the highest likelihood ratios; for example, names ending in 'a' are about 33 times more likely to be female than male:

>>> classifier.show_most_informative_features(5)
Most Informative Features
             last_letter = 'a'            female : male   =     33.2 : 1.0
             last_letter = 'k'              male : female =     32.6 : 1.0
             last_letter = 'p'              male : female =     19.7 : 1.0
             last_letter = 'v'              male : female =     18.6 : 1.0
             last_letter = 'f'              male : female =     17.3 : 1.0
The baseline accuracy with last_letter alone is 0.77. A richer extractor, gender_features2(), adds first_letter plus count(letter) and has(letter) features for every letter of the alphabet:

>>> def gender_features2(name):
...     features = {}
...     features['first_letter'] = name[0].lower()
...     features['last_letter'] = name[-1].lower()
...     for letter in 'abcdefghijklmnopqrstuvwxyz':
...         features['count({})'.format(letter)] = name.lower().count(letter)
...         features['has({})'.format(letter)] = (letter in name.lower())
...     return features
>>> gender_features2('John')
{'count(j)': 1, 'has(d)': False, 'count(b)': 0, ...}
Swapping gender_features() for gender_features2() actually hurts slightly (0.768 vs. 0.77): with so many features relative to the number of training names, the classifier overfits idiosyncrasies of the training data:

>>> featuresets = [(gender_features2(n), gender)
...                for (n, gender) in labeled_names]
>>> train_set, test_set = featuresets[500:], featuresets[:500]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
>>> print(nltk.classify.accuracy(classifier, test_set))
0.768
To tune features without contaminating the final evaluation, carve the data into train, dev-test, and test portions; the simple last-letter classifier scores 0.75 on the dev-test set:

>>> train_names = labeled_names[1500:]
>>> devtest_names = labeled_names[500:1500]
>>> test_names = labeled_names[:500]
>>> train_set = [(gender_features(n), gender)
...              for (n, gender) in train_names]
>>> devtest_set = [(gender_features(n), gender)
...                for (n, gender) in devtest_names]
>>> test_set = [(gender_features(n), gender)
...             for (n, gender) in test_names]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
>>> print(nltk.classify.accuracy(classifier, devtest_set))
0.75
Collect the dev-test errors as (tag, guess, name) triples, where tag is the correct label and guess is the classifier's prediction:

>>> errors = []
>>> for (name, tag) in devtest_names:
...     guess = classifier.classify(gender_features(name))
...     if guess != tag:
...         errors.append((tag, guess, name))
>>> for (tag, guess, name) in sorted(errors):
...     print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))
correct=female   guess=male     name=Abigail
...
correct=female   guess=male     name=Cindelyn
...
correct=female   guess=male     name=Katheryn
correct=female   guess=male     name=Kathryn
...
correct=male     guess=female   name=Aldrich
...
correct=male     guess=female   name=Mitch
...
correct=male     guess=female   name=Rich
...
The error list suggests that two-letter suffixes are informative: names ending in 'yn' are mostly female, while names ending in 'ch' are mostly male. Adding a suffix2 feature raises dev-test accuracy from 0.77 to 0.782:

>>> def gender_features(word):
...     return {'suffix1': word[-1:],
...             'suffix2': word[-2:]}
>>> train_set = [(gender_features(n), gender)
...              for (n, gender) in train_names]
>>> devtest_set = [(gender_features(n), gender)
...                for (n, gender) in devtest_names]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
>>> print(nltk.classify.accuracy(classifier, devtest_set))
0.782
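Once feature engineering against the dev-test set is finished, a final check on the untouched test set confirms the improvement carries over (sketch; the number printed depends on the random shuffle, so no output is shown):

>>> test_set = [(gender_features(n), gender)
...             for (n, gender) in test_names]
>>> print(nltk.classify.accuracy(classifier, test_set))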
Document classification: load nltk.corpus.movie_reviews and build documents, a list of (word-list, category) pairs such as ([u'plot', ..., u'echoes'], u'neg') and ([u'if', u'there', ...], u'pos'), then shuffle with random.shuffle():

>>> from nltk.corpus import movie_reviews
>>> documents = [(list(movie_reviews.words(fileid)), category)
...              for category in movie_reviews.categories()
...              for fileid in movie_reviews.fileids(category)]
>>> random.shuffle(documents)
Use nltk.FreqDist() to find the 2000 most frequent words in the corpus, then define a document feature extractor that records a boolean contains(word) feature (True/False) for each of those words. Converting the document to a set first makes the membership checks fast:

>>> all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
>>> word_features = list(all_words)[:2000]
>>> def document_features(document):
...     document_words = set(document)
...     features = {}
...     for word in word_features:
...         features['contains({})'.format(word)] = (word in document_words)
...     return features
>>> print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
{'contains(waste)': False, 'contains(lot)': False, ...}
Training a classifier on these document features yields 0.81 accuracy, and the most informative features make intuitive sense (a review mentioning 'outstanding' is about 11 times more likely to be positive):

>>> featuresets = [(document_features(d), c) for (d, c) in documents]
>>> train_set, test_set = featuresets[100:], featuresets[:100]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
>>> print(nltk.classify.accuracy(classifier, test_set))
0.81
>>> classifier.show_most_informative_features(5)
Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.1 : 1.0
        contains(seagal) = True              neg : pos    =      7.7 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
         contains(damon) = True              pos : neg    =      5.9 : 1.0
        contains(wasted) = True              neg : pos    =      5.8 : 1.0
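Accuracy alone can mask per-class behavior (compare the senseval results further down). A minimal sketch of per-label precision and recall, assuming the classifier and test_set built above:

>>> import collections
>>> from nltk.metrics import precision, recall
>>> refsets = collections.defaultdict(set)
>>> testsets = collections.defaultdict(set)
>>> for i, (feats, label) in enumerate(test_set):
...     refsets[label].add(i)                        # gold label for item i
...     testsets[classifier.classify(feats)].add(i)  # predicted label for item i
>>> for label in ['pos', 'neg']:
...     print(label, precision(refsets[label], testsets[label]),
...           recall(refsets[label], testsets[label]))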
For the tagging experiments, one way to split the Brown news data is to shuffle the tagged sentences with random.shuffle(tagged_sents). Note, however, that sentences from the same document then land in both sets, which makes the evaluation optimistic:

>>> import random
>>> from nltk.corpus import brown
>>> tagged_sents = list(brown.tagged_sents(categories='news'))
>>> random.shuffle(tagged_sents)
>>> size = int(len(tagged_sents) * 0.1)
>>> train_set, test_set = tagged_sents[size:], tagged_sents[:size]
A better split keeps whole files together, using file_ids, so the test sentences come from documents never seen in training:

>>> file_ids = brown.fileids(categories='news')
>>> size = int(len(file_ids) * 0.1)
>>> train_set = brown.tagged_sents(file_ids[size:])
>>> test_set = brown.tagged_sents(file_ids[:size])
A more stringent test evaluates on a different genre altogether:

>>> train_set = brown.tagged_sents(categories='news')
>>> test_set = brown.tagged_sents(categories='fiction')
Accuracy is the fraction of correct decisions:

    accuracy = (TP + TN) / (TP + FP + FN + TN)

With a skewed label distribution this can be misleading: if 990 of 1000 items belong to one class, a classifier that always guesses that class scores

    accuracy = (990 + 0) / (990 + 10 + 0 + 0) = 990 / 1000 = 99%
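The same arithmetic as a quick sketch (the counts are the hypothetical 990-vs-10 split above, not results from any experiment):

# Hypothetical counts for a majority-class guesser on a 990-vs-10 label split.
tp, fp, fn, tn = 990, 10, 0, 0
accuracy = (tp + tn) / (tp + fp + fn + tn)   # 0.99
precision = tp / (tp + fp)                   # 0.99 for the majority class
recall = tp / (tp + fn)                      # 1.0 -- yet the minority class
print(accuracy, precision, recall)           # is never found at all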
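The confusion-matrix example below applies a tagger named t2, which is not defined in this excerpt. A sketch reconstructing it as the backoff chain from the NLTK book's tagging chapter (default -> unigram -> bigram, trained on Brown news; an assumption, not part of this excerpt):

>>> train_sents = brown.tagged_sents(categories='news')
>>> t0 = nltk.DefaultTagger('NN')                     # unknown words default to 'NN'
>>> t1 = nltk.UnigramTagger(train_sents, backoff=t0)  # most likely tag per word
>>> t2 = nltk.BigramTagger(train_sents, backoff=t1)   # condition on the preceding tag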
>>> def tag_list(tagged_sents):
...     return [tag for sent in tagged_sents for (word, tag) in sent]
>>> def apply_tagger(tagger, corpus):
...     return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]
>>> gold = tag_list(brown.tagged_sents(categories='editorial'))
>>> test = tag_list(apply_tagger(t2,
...                 brown.tagged_sents(categories='editorial')))
>>> cm = nltk.ConfusionMatrix(gold, test)
>>> print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))
    |                                         N                      |
    |      N      I      A      J             N             V      N |
    |      N      N      T      J      .      S      ,      B      P |
----+----------------------------------------------------------------+
 NN | <11.8%>  0.0%      .   0.2%      .   0.0%      .   0.3%   0.0% |
 IN |   0.0%  <9.0%>     .      .      .   0.0%      .      .      . |
 AT |      .      .  <8.6%>     .      .      .      .      .      . |
 JJ |   1.7%      .      .  <3.9%>     .      .      .   0.0%   0.0% |
  . |      .      .      .      .  <4.8%>     .      .      .      . |
NNS |   1.5%      .      .      .      .  <3.2%>     .      .   0.0% |
  , |      .      .      .      .      .      .  <4.4%>     .      . |
 VB |   0.9%      .      .   0.0%      .      .      .  <2.4%>     . |
 NP |   1.0%      .      .   0.0%      .      .      .      .  <1.8%>|
----+----------------------------------------------------------------+
(row = reference; col = test)
http://www-rohan.sdsu.edu/~gawron/compling/course_core/assignments/new_maxent_assignment.htm
The assignment data file senseval-hard.xml contains instances of the adjective hard (word="hard-a"), each tagged with one of the senses HARD1, HARD2, or HARD3 along with its surrounding context.
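The same data ships with NLTK itself; a sketch of inspecting it through the senseval corpus reader (an alternative to the course's XML file, shown here only for orientation):

>>> from nltk.corpus import senseval
>>> inst = senseval.instances('hard.pos')[0]
>>> inst.word       # 'hard-a'
>>> inst.senses     # e.g. ('HARD1',)
>>> inst.position   # index of the target word within inst.context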
Run the course's extraction script to convert the XML into an event file (the 1 and 0 arguments are the script's own flags, left as given):

    python call_extract_event.py 1 0 senseval-hard.xml senseval-hard.evt
A note on call_maxent.py: NLTK 3 renamed classify.batch_classify() to classify.classify_many(), so older versions of the script may need that call updated.
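The fix is mechanical (sketch; featuresets stands for whatever list of feature dictionaries the script batches up):

# Old (NLTK 2.x):
#     labels = classifier.batch_classify(featuresets)
# New (NLTK 3.x):
labels = classifier.classify_many(featuresets)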
Testing classifier...
Accuracy: 0.8220
Total: 410

Label    Precision    Recall
HARD1    0.822        1.000
HARD2    0.000        0.000
HARD3    0.000        0.000

Label    Num Corr
HARD1    337
HARD2    0
HARD3    0

Note that 337/410 = 0.822: the classifier labels everything HARD1, so its accuracy exactly matches the majority-class baseline, even though it never finds HARD2 or HARD3.
from collections import Counter

def extract_vocab(event_list, n=100):
    # Google's stoplist with most prepositions removed; 'I' and 'and' added.
    stopwords = ['I', 'a', 'an', 'are', 'as', 'and', 'be', 'com', 'how',
                 'is', 'it', 'of', 'or', 'that', 'the', 'this', 'was',
                 'what', 'when', 'where', 'who', 'will', 'with', 'www']
    vocab = Counter()
    for (s_inst, sense) in event_list:
        for (i, item) in enumerate(s_inst.context):
            if i == int(s_inst.position):
                continue                    # skip the target word itself
            (item, wd, pos) = get_lex_components(item)
            if wd in stopwords:
                continue
            if pos in ['PRP', 'IN', 'CC', 'DT']:
                continue                    # drop pronouns, preps, conjunctions, determiners
            vocab[item] += 1
    # keep only the n most frequent context items
    return dict(vocab.most_common(n))
Build featsets as a list of (feature-dictionary, label) pairs:

    featsets = [({'feat1': val11, 'feat2': val21, ...}, 'class1'),
                ({'feat1': val12, 'feat2': val22, ...}, 'class2'),
                ...,
                ({'feat1': val1N, 'feat2': val2N, ...}, 'classN')]

Split off 10% for testing, pick a classifier class, train, and evaluate (see dir(nltk.classify) for the other available classifiers):

    N = len(featsets)
    train_set = featsets[:int(N * 0.9)]
    test_set = featsets[int(N * 0.9):]

    classifier_class = nltk.classify.naivebayes.NaiveBayesClassifier
    # or: classifier_class = nltk.classify.maxent.MaxentClassifier
    classifier = classifier_class.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
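For the maxent option, a usage sketch (algorithm, trace, and max_iter are optional keyword arguments of MaxentClassifier.train; the values here are illustrative choices, not the assignment's settings):

    classifier = nltk.classify.maxent.MaxentClassifier.train(
        train_set,
        algorithm='IIS',   # pure-Python iterative scaling; no external packages
        trace=0,           # silence the per-iteration training log
        max_iter=10)       # stop after 10 iterations
    print(nltk.classify.accuracy(classifier, test_set))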