NLTK

NLTK - Natural Language Toolkit

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('state_union')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('movie_reviews')
nltk.download('vader_lexicon')
# if no download pop-up appears, run these downloads from a Python console (e.g. in PyCharm)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...

True

Preprocessing the text

Tokenizing

  • Word tokenizers: split text into words
  • Sentence tokenizers: split text into sentences
  • Corpora: a body of text, usually grouped by language or topic
  • Lexicon: words and their meanings, which depend on the domain: in investor speak a “bull” is a positive market signal, while for an everyday English speaker a bull is a scary animal
from nltk.tokenize import sent_tokenize, word_tokenize
eg = "Im here working on this NLTK packages. in ten minutes i will have the CSS lesson, what A"

Split by sentences.

The tokenizer looks at periods, capital letters and so on, but it is smarter than a naive split on “.”: an abbreviation like “Mr. Smith” does not start a new sentence.

sent_tokenize(eg)
['Im here working on this NLTK packages.',
 'in ten minutes i will have the CSS lesson, what A']
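The word tokenizer works the same way; a quick check on the same string (the output shown in the comment may differ slightly across NLTK versions):

from nltk.tokenize import word_tokenize
print(word_tokenize(eg)[:8])
# -> ['Im', 'here', 'working', 'on', 'this', 'NLTK', 'packages', '.']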

Stopwords

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop = set(stopwords.words("english"))

Stopwords are common filler words that we usually want to drop. To remove them from a text we must check every token against this set one by one; first, let's peek at a few entries:

some_stopwords = list(stop) # the set is unordered, so these are just arbitrary entries
some_stopwords[0:3]
['between', 'do', 'then']
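To actually remove the stopwords from a text we filter the tokens against this set; a minimal sketch reusing the eg sentence from above:

tokens = word_tokenize(eg)
# keep only the tokens that are not in the stopword set (case-insensitive)
filtered_sentence = [w for w in tokens if w.lower() not in stop]
print(filtered_sentence)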

Stemming

  • take the root stem of the word: “reading” → “read”
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer() # create a PorterStemmer instance
print(type(ps))
<class 'nltk.stem.porter.PorterStemmer'>
example = ["try","trying","tryed","tryly"]
for word in example:
    print(ps.stem(word))
tri
tri
tri
tryli
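In practice the stemmer is applied to every token of a text; a small sketch on the eg sentence defined earlier:

# tokenize first, then stem each word
stemmed = [ps.stem(w) for w in word_tokenize(eg)]
print(stemmed)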

Part of Speech tagging

POS tagging creates a tuple for each word: the word itself and its grammatical role in the phrase (noun, verb, adjective, ...).

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom = PunktSentenceTokenizer(train) # train a Punkt tokenizer on the 2005 speech (optional: NLTK's pre-trained one works too)
tokenized = custom.tokenize(sample_text)
for idx, i in enumerate(tokenized):
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    print(tagged)
    if idx > 10: break
[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nation', 'NN'), ('lost', 'VBD'), ('a', 'DT'), ('beloved', 'VBN'), (',', ','), ('graceful', 'JJ'), (',', ','), ('courageous', 'JJ'), ('woman', 'NN'), ('who', 'WP'), ('called', 'VBD'), ('America', 'NNP'), ('to', 'TO'), ('its', 'PRP$'), ('founding', 'NN'), ('ideals', 'NNS'), ('and', 'CC'), ('carried', 'VBD'), ('on', 'IN'), ('a', 'DT'), ('noble', 'JJ'), ('dream', 'NN'), ('.', '.')]
[('Tonight', 'NN'), ('we', 'PRP'), ('are', 'VBP'), ('comforted', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('hope', 'NN'), ('of', 'IN'), ('a', 'DT'), ('glad', 'JJ'), ('reunion', 'NN'), ('with', 'IN'), ('the', 'DT'), ('husband', 'NN'), ('who', 'WP'), ('was', 'VBD'), ('taken', 'VBN'), ('so', 'RB'), ('long', 'RB'), ('ago', 'RB'), (',', ','), ('and', 'CC'), ('we', 'PRP'), ('are', 'VBP'), ('grateful', 'JJ'), ('for', 'IN'), ('the', 'DT'), ('good', 'JJ'), ('life', 'NN'), ('of', 'IN'), ('Coretta', 'NNP'), ('Scott', 'NNP'), ('King', 'NNP'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('President', 'NNP'), ('George', 'NNP'), ('W.', 'NNP'), ('Bush', 'NNP'), ('reacts', 'VBZ'), ('to', 'TO'), ('applause', 'VB'), ('during', 'IN'), ('his', 'PRP$'), ('State', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('Union', 'NNP'), ('Address', 'NNP'), ('at', 'IN'), ('the', 'DT'), ('Capitol', 'NNP'), (',', ','), ('Tuesday', 'NNP'), (',', ','), ('Jan', 'NNP'), ('.', '.')]
[('31', 'CD'), (',', ','), ('2006', 'CD'), ('.', '.')]
[('White', 'NNP'), ('House', 'NNP'), ('photo', 'NN'), ('by', 'IN'), ('Eric', 'NNP'), ('DraperEvery', 'NNP'), ('time', 'NN'), ('I', 'PRP'), ("'m", 'VBP'), ('invited', 'JJ'), ('to', 'TO'), ('this', 'DT'), ('rostrum', 'NN'), (',', ','), ('I', 'PRP'), ("'m", 'VBP'), ('humbled', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('privilege', 'NN'), (',', ','), ('and', 'CC'), ('mindful', 'NN'), ('of', 'IN'), ('the', 'DT'), ('history', 'NN'), ('we', 'PRP'), ("'ve", 'VBP'), ('seen', 'VBN'), ('together', 'RB'), ('.', '.')]
[('We', 'PRP'), ('have', 'VBP'), ('gathered', 'VBN'), ('under', 'IN'), ('this', 'DT'), ('Capitol', 'NNP'), ('dome', 'NN'), ('in', 'IN'), ('moments', 'NNS'), ('of', 'IN'), ('national', 'JJ'), ('mourning', 'NN'), ('and', 'CC'), ('national', 'JJ'), ('achievement', 'NN'), ('.', '.')]
[('We', 'PRP'), ('have', 'VBP'), ('served', 'VBN'), ('America', 'NNP'), ('through', 'IN'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('most', 'RBS'), ('consequential', 'JJ'), ('periods', 'NNS'), ('of', 'IN'), ('our', 'PRP$'), ('history', 'NN'), ('--', ':'), ('and', 'CC'), ('it', 'PRP'), ('has', 'VBZ'), ('been', 'VBN'), ('my', 'PRP$'), ('honor', 'NN'), ('to', 'TO'), ('serve', 'VB'), ('with', 'IN'), ('you', 'PRP'), ('.', '.')]
[('In', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('two', 'CD'), ('parties', 'NNS'), (',', ','), ('two', 'CD'), ('chambers', 'NNS'), (',', ','), ('and', 'CC'), ('two', 'CD'), ('elected', 'JJ'), ('branches', 'NNS'), (',', ','), ('there', 'EX'), ('will', 'MD'), ('always', 'RB'), ('be', 'VB'), ('differences', 'NNS'), ('and', 'CC'), ('debate', 'NN'), ('.', '.')]
[('But', 'CC'), ('even', 'RB'), ('tough', 'JJ'), ('debates', 'NNS'), ('can', 'MD'), ('be', 'VB'), ('conducted', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('civil', 'JJ'), ('tone', 'NN'), (',', ','), ('and', 'CC'), ('our', 'PRP$'), ('differences', 'NNS'), ('can', 'MD'), ('not', 'RB'), ('be', 'VB'), ('allowed', 'VBN'), ('to', 'TO'), ('harden', 'VB'), ('into', 'IN'), ('anger', 'NN'), ('.', '.')]
[('To', 'TO'), ('confront', 'VB'), ('the', 'DT'), ('great', 'JJ'), ('issues', 'NNS'), ('before', 'IN'), ('us', 'PRP'), (',', ','), ('we', 'PRP'), ('must', 'MD'), ('act', 'VB'), ('in', 'IN'), ('a', 'DT'), ('spirit', 'NN'), ('of', 'IN'), ('goodwill', 'NN'), ('and', 'CC'), ('respect', 'NN'), ('for', 'IN'), ('one', 'CD'), ('another', 'DT'), ('--', ':'), ('and', 'CC'), ('I', 'PRP'), ('will', 'MD'), ('do', 'VB'), ('my', 'PRP$'), ('part', 'NN'), ('.', '.')]
  • CC coordinating conjunction
  • CD cardinal digit
  • DT determiner
  • EX existential there (like: “there is” … think of it like “there exists”)
  • FW foreign word
  • IN preposition/subordinating conjunction
  • JJ adjective ‘big’
  • JJR adjective, comparative ‘bigger’
  • JJS adjective, superlative ‘biggest’
  • LS list marker 1)
  • MD modal could, will
  • NN noun, singular ‘desk’
  • NNS noun plural ‘desks’
  • NNP proper noun, singular ‘Harrison’
  • NNPS proper noun, plural ‘Americans’
  • PDT predeterminer ‘all the kids’
  • POS possessive ending parent’s
  • PRP personal pronoun I, he, she
  • PRP$ possessive pronoun my, his, hers
  • RB adverb very, silently,
  • RBR adverb, comparative better
  • RBS adverb, superlative best
  • RP particle give up
  • TO to: go ‘to’ the store
  • UH interjection, errrrrrrrm
  • VB verb, base form take
  • VBD verb, past tense, took
  • VBG verb, gerund/present participle taking
  • VBN verb, past participle is taken
  • VBP verb, non-3rd person singular present take
  • VBZ verb, 3rd person sing. present takes
  • WDT wh-determiner which
  • WP wh-pronoun who, what
  • WP$ possessive wh-pronoun whose
  • WRB wh-adverb where, when
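If you forget what a tag means, NLTK can print its definition; note this needs the extra tagsets resource (the download below is an assumption about your environment):

nltk.download('tagsets')      # one-off download of the tag documentation
nltk.help.upenn_tagset('JJ')  # prints the description and examples for 'JJ'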

Chunking

custom = PunktSentenceTokenizer(train) 
tokenized = custom.tokenize(sample_text)
for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunk = r"""Chunk: {<VB.?>} """ # group any verb tag (VB, VBD, VBG, ...)
    chunkParser = nltk.RegexpParser(chunk)
    chunked = chunkParser.parse(tagged)
    print(words, "\n\n", chunked)
    break
['PRESIDENT', 'GEORGE', 'W.', 'BUSH', "'S", 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION', 'OF', 'THE', 'CONGRESS', 'ON', 'THE', 'STATE', 'OF', 'THE', 'UNION', 'January', '31', ',', '2006', 'THE', 'PRESIDENT', ':', 'Thank', 'you', 'all', '.'] 

 (S
  PRESIDENT/NNP
  GEORGE/NNP
  W./NNP
  BUSH/NNP
  'S/POS
  ADDRESS/NNP
  BEFORE/IN
  A/NNP
  JOINT/NNP
  SESSION/NNP
  OF/IN
  THE/NNP
  CONGRESS/NNP
  ON/NNP
  THE/NNP
  STATE/NNP
  OF/IN
  THE/NNP
  UNION/NNP
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
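The grammar above only grabs single verbs. Chunking is more commonly used for noun phrases; a classic textbook pattern (just a sketch, many grammars are possible) is an optional determiner, any number of adjectives, then one or more nouns:

np_grammar = r"""NP: {<DT>?<JJ>*<NN.*>+}"""   # optional determiner + adjectives + noun(s)
np_parser = nltk.RegexpParser(np_grammar)
sentence = nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumped over the lazy dog"))
print(np_parser.parse(sentence))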

Lemmatizing

It is similar to stemming, but it returns a real dictionary word!

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("goodest", pos="a"))
print(lemmatizer.lemmatize("good", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
good
good
good
best

The pos argument defaults to “n”, which stands for noun. If your word is a different part of speech you must pass it explicitly, as in the sketch below.
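For example, a verb is only reduced to its base form when you tell the lemmatizer it is a verb:

print(lemmatizer.lemmatize("running"))           # 'running' - treated as a noun by default
print(lemmatizer.lemmatize("running", pos="v"))  # 'run'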

WordNet

  • Find synonyms, meanings, antonyms and so on!
from nltk.corpus import wordnet

Synset

syns = wordnet.synsets("love")
print(syns)
[Synset('love.n.01'), Synset('love.n.02'), Synset('beloved.n.01'), Synset('love.n.04'), Synset('love.n.05'), Synset('sexual_love.n.02'), Synset('love.v.01'), Synset('love.v.02'), Synset('love.v.03'), Synset('sleep_together.v.01')]
print(syns[9].lemmas())
[Lemma('sleep_together.v.01.sleep_together'), Lemma('sleep_together.v.01.roll_in_the_hay'), Lemma('sleep_together.v.01.love'), Lemma('sleep_together.v.01.make_out'), Lemma('sleep_together.v.01.make_love'), Lemma('sleep_together.v.01.sleep_with'), Lemma('sleep_together.v.01.get_laid'), Lemma('sleep_together.v.01.have_sex'), Lemma('sleep_together.v.01.know'), Lemma('sleep_together.v.01.do_it'), Lemma('sleep_together.v.01.be_intimate'), Lemma('sleep_together.v.01.have_intercourse'), Lemma('sleep_together.v.01.have_it_away'), Lemma('sleep_together.v.01.have_it_off'), Lemma('sleep_together.v.01.screw'), Lemma('sleep_together.v.01.fuck'), Lemma('sleep_together.v.01.jazz'), Lemma('sleep_together.v.01.eff'), Lemma('sleep_together.v.01.hump'), Lemma('sleep_together.v.01.lie_with'), Lemma('sleep_together.v.01.bed'), Lemma('sleep_together.v.01.have_a_go_at_it'), Lemma('sleep_together.v.01.bang'), Lemma('sleep_together.v.01.get_it_on'), Lemma('sleep_together.v.01.bonk')]

Definition

print(syns[7])
print(syns[7].lemmas())
print(syns[7])
print(syns[7].definition())
Synset('love.v.02')
[Lemma('love.v.02.love'), Lemma('love.v.02.enjoy')]
Synset('love.v.02')
get pleasure from

Examples

print(syns[7].examples())
['I love cooking']
sinonimi = [] # synonyms
contrari = [] # antonyms
for syn in wordnet.synsets("love"):
    for l in syn.lemmas():
        sinonimi.append(l)
        if l.antonyms():
            contrari.append(l.antonyms()[0].name())
sinonimi[10:15]
[Lemma('love.n.04.erotic_love'),
 Lemma('love.n.05.love'),
 Lemma('sexual_love.n.02.sexual_love'),
 Lemma('sexual_love.n.02.lovemaking'),
 Lemma('sexual_love.n.02.making_love')]

Find semantic similarity

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.lemmas())
print(w2.lemmas())
[Lemma('ship.n.01.ship')]
[Lemma('boat.n.01.boat')]
print(w1.wup_similarity(w2)) # high similarity: ship and boat sit close together in the WordNet hierarchy
0.9090909090909091
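For contrast, an unrelated pair scores far lower (exact numbers depend on your WordNet version):

w3 = wordnet.synset("cat.n.01")
print(w1.wup_similarity(w3))  # ship vs cat: roughly 0.3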

Text Classifier

Here we cover the binary case: classifying movie reviews as positive or negative.

import nltk
import random as rd # we will use this to shuffle the dataset
from nltk.corpus import movie_reviews # a corpus of movie reviews, already labelled pos/neg

Load pre-labelled reviews

dir(movie_reviews)[-15:]
['__name__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_unload',
 'subdir',
 'unicode_repr']
documents = []
i = 0
for category in movie_reviews.categories():
    if i == 0:
        print(category)
    for fileid in movie_reviews.fileids(category):
        if i == 0:
            print(fileid)
        documents.append((list(movie_reviews.words(fileid)), category))
        if i == 0:
            print(documents[0][0][0:20], documents[0][1])
        i+=1
neg
neg/cv000_29416.txt
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an'] neg
rd.shuffle(documents)
print(documents[0][0][0:10], documents[0][1])
['to', 'sum', 'the', 'entire', 'film', '"', '54', '"', 'up', 'in'] neg
all_words= []
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
common = set(stopwords.words("english"))
punctuation = set([",",".",":",";","(",")","!","?","'"," \" ", "-" ]) # note: ' \" ' has spaces around the quote, so bare '"' tokens slip through (see the counts below)
for w in movie_reviews.words():
    if w not in punctuation and w not in common:
        all_words.append(w.lower())

NLTK frequency distribution (FreqDist)

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(5))
print(len(all_words)) # roughly 40,000 distinct words
[('"', 17612), ('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690)]
39607
all_words["stupid"] # is like a dictionary
253

Limit the features to the most important words

word_features = list(all_words.keys())[:3000]
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
feature_sets = [(find_features(rev),category) for (rev,category) in documents]

feature_sets is a list of tuples: the first element of each tuple is a dictionary mapping each of the 3000 feature words to a boolean (is that word contained in this review?), and the second element is the review's label, positive or negative.
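One caveat: list(all_words.keys())[:3000] takes the first 3000 keys in insertion order, which is not guaranteed to be the 3000 most frequent words. A safer variant (a small sketch) asks the FreqDist directly:

# take the 3000 genuinely most common words as features
word_features = [w for w, count in all_words.most_common(3000)]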

Fit the NaiveBayes Classifier

train_x = feature_sets[:1900]
test_x = feature_sets[1900:]
classifier = nltk.NaiveBayesClassifier.train(train_x)
print("accuracy:", nltk.classify.accuracy(classifier, test_x))
accuracy: 0.81
classifier.show_most_informative_features(10)
Most Informative Features
               atrocious = True              neg : pos    =     11.6 : 1.0
                   sucks = True              neg : pos    =      9.7 : 1.0
                     ugh = True              neg : pos    =      9.6 : 1.0
                  annual = True              pos : neg    =      9.1 : 1.0
                 frances = True              pos : neg    =      9.1 : 1.0
                    yawn = True              neg : pos    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      8.3 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.5 : 1.0
                obstacle = True              pos : neg    =      6.4 : 1.0
                 cunning = True              pos : neg    =      6.4 : 1.0
                  crappy = True              neg : pos    =      6.3 : 1.0
                  sexist = True              neg : pos    =      6.3 : 1.0
                  shoddy = True              neg : pos    =      6.3 : 1.0
                  poorly = True              neg : pos    =      6.0 : 1.0
                  justin = True              neg : pos    =      5.8 : 1.0
                 singers = True              pos : neg    =      5.7 : 1.0

Pickle to save the algorithm

import pickle
save_classifier = open("naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

Load

classifier_f= open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
nltk.classify.accuracy(classifier, test_x)
0.81
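Once reloaded, the classifier can score a single review directly; a quick sketch using find_features and documents from above (your output will vary with the shuffle):

review_words, true_label = documents[0]
prediction = classifier.classify(find_features(review_words))
print("predicted:", prediction, "- actual:", true_label)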

Integration with Sklearn

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB # BernoulliNB wants binary features, MultinomialNB counts; GaussianNB is imported but unused here

Multinomial Naive Bayes

Multinom_classifier = SklearnClassifier(MultinomialNB())
Multinom_classifier.train(train_x)
print("accuracy", nltk.classify.accuracy(Multinom_classifier, test_x))
accuracy 0.84

Bernoulli Naive Bayes

Bernoulli_classifier = SklearnClassifier(BernoulliNB())
Bernoulli_classifier.train(train_x)
print("accuracy", nltk.classify.accuracy(Bernoulli_classifier, test_x))
accuracy 0.8

Other sklearn models

Any scikit-learn classifier can be wrapped the same way:

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
SVC_class = SklearnClassifier(SVC())
SVC_class.train(train_x)
print("accuracy", nltk.classify.accuracy(SVC_class, test_x))
accuracy 0.88

Combining models

from nltk.classify import ClassifierI
from statistics import mode
Multinom_classifier = SklearnClassifier(MultinomialNB())
Multinom_classifier.train(train_x)

Bernoulli_classifier = SklearnClassifier(BernoulliNB())
Bernoulli_classifier.train(train_x)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_x)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_x)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(train_x)
<SklearnClassifier(NuSVC())>
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several trained classifiers."""
    def __init__(self, *classifiers):
        self.classifiers = classifiers

    def classify(self, features):
        # one vote per classifier; the majority label wins
        votes = [c.classify(features) for c in self.classifiers]
        return mode(votes)

    def confidence(self, features):
        # fraction of classifiers that agree with the majority label
        votes = [c.classify(features) for c in self.classifiers]
        return votes.count(mode(votes)) / len(votes)

voted_classifier = VoteClassifier(Multinom_classifier, 
                                  Bernoulli_classifier, 
                                  SGDClassifier_classifier, 
                                  LinearSVC_classifier, 
                                  NuSVC_classifier)
voted_classifier.classifiers
(<SklearnClassifier(MultinomialNB())>,
 <SklearnClassifier(BernoulliNB())>,
 <SklearnClassifier(SGDClassifier())>,
 <SklearnClassifier(LinearSVC())>,
 <SklearnClassifier(NuSVC())>)
print("Accuracy of bouch of", nltk.classify.accuracy(voted_classifier, test_x))
Accuracy of bouch of 0.88
new_obs = {"suks":True, "works":False, "good":True, "uni":True,"palace":True}
print(voted_classifier.classify(new_obs), voted_classifier.confidence(new_obs))
neg 0.8

To understand which words are relevant to the analysis, we can inspect the most informative features of the Naive Bayes classifier:

classifier.show_most_informative_features(15)
Most Informative Features
               atrocious = True              neg : pos    =     11.6 : 1.0
                   sucks = True              neg : pos    =      9.7 : 1.0
                     ugh = True              neg : pos    =      9.6 : 1.0
                  annual = True              pos : neg    =      9.1 : 1.0
                 frances = True              pos : neg    =      9.1 : 1.0
                    yawn = True              neg : pos    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      8.3 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.5 : 1.0
                obstacle = True              pos : neg    =      6.4 : 1.0
                 cunning = True              pos : neg    =      6.4 : 1.0
                  crappy = True              neg : pos    =      6.3 : 1.0

Short sentence dataset - Recap

"https://pythonprogramming.net/static/downloads/short_reviews/"
import requests
negative = requests.get("https://pythonprogramming.net/static/downloads/short_reviews/negative.txt").text
positive = requests.get("https://pythonprogramming.net/static/downloads/short_reviews/positive.txt").text

Create the documents with their associated labels

documents = []
for row in negative.split("\n"):
    documents.append((row, "Negative"))
for row in positive.split("\n"):
    documents.append((row, "Positive"))

rd.shuffle(documents)
documents[0]
('predictable and cloying , though brown sugar is so earnest in its yearning for the days before rap went nihilistic that it summons more spirit and bite than your average formulaic romantic quadrangle . ',
 'Negative')

Create a Frequency distribution of the words

all_words = []
negative_word = nltk.tokenize.word_tokenize(negative)
positive_word = nltk.tokenize.word_tokenize(positive)

common = set(stopwords.words("english"))
punctuation = set([",",".",":",";","(",")","!","?","'"," \" ", "-" ])
for w in negative_word:
    if w not in punctuation and w not in common:
        all_words.append(w.lower())
for w in positive_word:
    if w not in punctuation and w not in common:
        all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(5))
print(len(all_words))

word_features = list(all_words.keys())[:6000] # take the first 6000 (same caveat as before: all_words.most_common(6000) is safer)
[("'s", 3537), ('film', 1590), ('movie', 1336), ("n't", 940), ('one', 739)]
20162

Create a feature set

def find_features(document):
    words = nltk.word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

feature_set = [(find_features(rev), category) for (rev, category) in documents ]
train_x = feature_set[:10000]
test_x = feature_set[10000:]
# fitting all five models on this larger feature set took around 30 minutes,
# so the training block is gated behind a flag
train = False
if train:
  Multinom_classifier = SklearnClassifier(MultinomialNB())
  Multinom_classifier.train(train_x)
  Bernoulli_classifier = SklearnClassifier(BernoulliNB())
  Bernoulli_classifier.train(train_x)
  SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
  SGDClassifier_classifier.train(train_x)
  LinearSVC_classifier = SklearnClassifier(LinearSVC())
  LinearSVC_classifier.train(train_x)
  NuSVC_classifier = SklearnClassifier(NuSVC())
  NuSVC_classifier.train(train_x)
  class VoteClassifier(ClassifierI):
      def __init__(self, *classifiers):
          self.classifiers = classifiers
      def classify(self,features):
          votes = []
          for c in self.classifiers:
              v = c.classify(features)
              votes.append(v)
          return mode(votes)
      def confidence(self, features):
          votes = []
          for c in self.classifiers:
              v = c.classify(features)
              votes.append(v)
          choice_vote =  votes.count(mode(votes))
          return (choice_vote/len(votes))
      
  voted_classifier = VoteClassifier(Multinom_classifier, 
                                    Bernoulli_classifier, 
                                    SGDClassifier_classifier, 
                                    LinearSVC_classifier, 
                                    NuSVC_classifier)

Experiments

VADER: an already-trained algorithm

NLTK also ships with VADER, a pre-trained, rule-based sentiment analyzer. It is slow, but it takes word combinations and negations into account. In my experiments it does not work well on this data, though: the accuracy stays under 0.5.

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
print(sid.polarity_scores("i just want something that works with italian words"))
print(sid.polarity_scores("i just do not love you"))
x = sid.polarity_scores(" get my weed my from california that’s that shit")
print(x)
{'neg': 0.0, 'neu': 0.843, 'pos': 0.157, 'compound': 0.0772}
{'neg': 0.457, 'neu': 0.543, 'pos': 0.0, 'compound': -0.5216}
{'neg': 0.31, 'neu': 0.69, 'pos': 0.0, 'compound': -0.5574}
# find the label with the highest score
score, value = 0, 0
for label, s in x.items():
    if s > score:
        score = s
        value = label
print(score, value)
0.69 neu
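The same “pick the top label” logic is a one-liner with max, and in practice VADER users usually threshold the compound score instead; the ±0.05 cutoffs below follow the convention suggested by the VADER authors, so treat them as an assumption:

print(max(x, key=x.get))  # 'neu', the key with the highest value

def vader_label(scores, cutoff=0.05):
    # classify a sentence from its compound score
    if scores["compound"] >= cutoff:
        return "positive"
    if scores["compound"] <= -cutoff:
        return "negative"
    return "neutral"

print(vader_label(x))  # 'negative' (compound is -0.5574)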