NLP Python Tutorial Flashcards
Function to download nltk corpora
nltk.download()
Import to tokenize sentences
from nltk.tokenize import sent_tokenize
Import to tokenize words
from nltk.tokenize import word_tokenize
Function to tokenize sentences
print(sent_tokenize())
Function to tokenize sentences example
print(sent_tokenize(text))
Function to tokenize words
print(word_tokenize())
Function to tokenize words example
print(word_tokenize(text))
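A minimal assembled sketch of both tokenizers; the text string here is a hypothetical sample, and the punkt models must be downloaded first via nltk.download('punkt'):
from nltk.tokenize import sent_tokenize, word_tokenize
text = "Hello Mr. Smith, how are you doing today? The weather is great."  # hypothetical sample
print(sent_tokenize(text))  # list of sentence strings
print(word_tokenize(text))  # list of word and punctuation tokens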
Import for stop words
from nltk.corpus import stopwords
Turning stop words into a variable
stop_words = set(stopwords.words('english'))
Turning tokenized text into a variable
word_tokens = word_tokenize()
Turning tokenized text into a variable example
word_tokens = word_tokenize(example)
Function for removing stop words
sentence = [w for w in word_tokens if w not in stop_words]
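The stop-word cards assembled into one runnable sketch; example is a hypothetical sentence, and the stopwords corpus must be downloaded via nltk.download('stopwords'):
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example = "This is a sample sentence showing off stop word filtration."  # hypothetical input
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example)
sentence = [w for w in word_tokens if w not in stop_words]
print(sentence)  # tokens with common English stop words removed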
Stemming import
from nltk.stem import PorterStemmer
Turning stemmer into a variable
ps = PorterStemmer()
Function for stemming
for w in:
print(ps.stem(w))
Function for stemming example
for w in example_words:
print(ps.stem(w))
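example_words is never defined on these cards; one hypothetical list that makes the loop runnable:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
example_words = ["python", "pythoner", "pythoning", "pythoned"]  # hypothetical word list
for w in example_words:
    print(ps.stem(w))  # each form should reduce to the stem "python"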
Stemming an entire sentence steps
- Tokenize
- Stem
Stemming an entire sentence example
words = word_tokenize(new_text)
for w in words:
print(ps.stem(w))
Importing an nltk text
from nltk.corpus import
Importing an nltk text example
from nltk.corpus import udhr
Import for the PunktSentenceTokenizer
from nltk.tokenize import PunktSentenceTokenizer
Training the PunktSentenceTokenizer
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
Run sample text through PunktSentenceTokenizer
tokenized = custom_sent_tokenizer.tokenize(sample_text)
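One way to supply train_text and sample_text, assuming the state_union corpus is available (any two raw texts would do):
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")   # raw text to train on
sample_text = state_union.raw("2006-GWBush.txt")  # raw text to tokenize
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)  # unsupervised training
tokenized = custom_sent_tokenizer.tokenize(sample_text)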
Tag each tokenized word with a part of speech steps
- For loop
- Tokenize words
- Tag words
- Print tagged words
Tag each tokenized word with a part of speech example
for i in tokenized[:5]:
words = nltk.word_tokenize(i)
tagged = nltk.pos_tag(words)
print(tagged)
Chunking 3 steps
- chunkgram =
- chunkParser =
- chunked =
chunkGram
chunkGram = r"""Chunk: { }"""
chunkGram example
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser
chunkParser = nltk.RegexpParser()
chunkParser example
chunkParser = nltk.RegexpParser(chunkGram)
chunked
chunked = chunkParser.parse(tagged)
Print the nltk tree
for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
print(subtree)
Draw chunks with nltk
chunked.draw()
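The three chunking steps assembled into one sketch; tagged is assumed to be a list of (word, tag) tuples from nltk.pos_tag:
import nltk
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""  # optional adverbs/verbs, then proper nouns
chunkParser = nltk.RegexpParser(chunkGram)
chunked = chunkParser.parse(tagged)  # tagged = nltk.pos_tag(word list)
for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
    print(subtree)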
namedEnt definition
Marks proper nouns as organizations or people or money, etc
namedEnt steps
- Tokenize
- pos Tagging
- namedEnt
namedEnt example
namedEnt = nltk.ne_chunk(tagged, binary=False)
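A minimal named-entity sketch with a hypothetical sentence; binary=True would collapse every entity into a single 'NE' label, while binary=False keeps types like PERSON and ORGANIZATION (requires the maxent_ne_chunker and words downloads):
import nltk
words = nltk.word_tokenize("Barack Obama spoke at Google headquarters.")  # hypothetical sentence
tagged = nltk.pos_tag(words)
namedEnt = nltk.ne_chunk(tagged, binary=False)
print(namedEnt)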
chunkGram for chinking
chunkGram = r"""Chunk: {<.*>+}
}<VB.?|IN|DT|TO>+{"""
# }...{ chinks (removes) the matched tags back out of the chunk
Build a list of documents
documents = [(list( .words(fileid)), category)
for category in .categories()
for fileid in .fileids(category)]
Build a list of documents example
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
Shuffle the documents
import random
random.shuffle(documents)
Print the number of documents
print('Number of Documents {}'.format(len(documents)))
Print the first review
print('First Review: {}'.format(documents[0]))
Write all the words in the reviews
all_words = []
for w in movie_reviews.words():
all_words.append(w.lower())
Create a FreqDist
all_words = nltk.FreqDist(all_words)
Find the most common words
print('Most common words: {}'.format(all_words.most_common(15)))
Find how many times the word "happy" is used
print('the word happy: {}'.format(all_words["happy"]))
Find how many words are in the text
print(len(all_words))
Use the 4000 most common words as features
word_features = [w for w, count in all_words.most_common(4000)]  # most_common guarantees frequency order; list(all_words.keys())[:4000] does not
Find features steps
- define function
- prep
- for loop
- return features
Define find features
def find_features(document):
Find features prep
words = set(document)
features = {}
Find features for loop
for w in word_features:
features[w] = (w in words)
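The find_features cards assembled into the full function, including the return step listed above:
def find_features(document):
    words = set(document)           # unique words in the document
    features = {}
    for w in word_features:         # word_features: the 4000 most common words
        features[w] = (w in words)  # True when the feature word appears in the document
    return features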
Find features example steps
- prep
- for loop
- print(features)
Find features example prep
features = find_features(.words(' '))
Find features example prep example
features = find_features(movie_reviews.words(‘neg/cv000_29416.txt’))
Find features example for loop steps
- for loop
- if clause
- print(keys)
Find features example for loop
for key, value in features.items():
Find features example if clause
if value == True:
print(key)
Find features for all documents
featuresets = [(find_features(rev), category) for (rev, category) in documents]
Import model selection
from sklearn import model_selection
Define a seed for reproducibility
seed = 1
Split featuresets into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)
Import sklearn classifier
from nltk.classify.scikitlearn import SklearnClassifier
import SVC
from sklearn.svm import SVC
Define the SVC model
model = SklearnClassifier(SVC(kernel='linear'))
Train the model on the training data
model.train(training)
Find the accuracy of the SVC model steps
- define accuracy
- print
Define accuracy
accuracy = nltk.classify.accuracy(model, testing)
Print accuracy
print('SVC Accuracy: {}'.format(accuracy))
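The model cards assembled into a short end-to-end sketch, assuming featuresets was built as above:
import nltk
from sklearn import model_selection
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

seed = 1
training, testing = model_selection.train_test_split(featuresets, test_size=0.25, random_state=seed)
model = SklearnClassifier(SVC(kernel='linear'))
model.train(training)
accuracy = nltk.classify.accuracy(model, testing)
print('SVC Accuracy: {}'.format(accuracy))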